Skip to content

Commit 37580ac

Browse files
committed
Fixed the dependency conflict between camel-tools and rbpe (transformers version requirements); fixed automatic installation of CAMeL Tools data packages.
1 parent 12c0417 commit 37580ac

4 files changed

Lines changed: 57 additions & 15 deletions

File tree

dalla_data_processing/cli.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,15 @@ def pack(
651651
tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
652652
except ImportError:
653653
logger.error("Missing rbpe package")
654-
logger.error("Install with: pip install rbpe")
654+
logger.error(
655+
"rbpe is not included in the default installation due to "
656+
"dependency conflicts with camel-tools (transformers version requirements)"
657+
)
658+
logger.error("Install separately with: pip install rbpe")
659+
logger.error(
660+
"Note: Installing rbpe may require a separate environment "
661+
"if you also use dedup/stem/quality features"
662+
)
655663
sys.exit(1)
656664
else:
657665
try:

dalla_data_processing/quality/checker.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from types import MethodType
1212
from typing import Any
1313

14+
from camel_tools.data.catalogue import Catalogue
1415
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
1516
from camel_tools.disambig.mle import MLEDisambiguator
1617
from datasets import Dataset
@@ -53,6 +54,25 @@ def __init__(self, timeout: int = 3600, model: str = "mle", use_gpu: bool = Fals
5354

5455
def _init_disambiguator(self):
5556
"""Initialize and configure the disambiguator with caching."""
57+
# Install required CAMeL Tools packages based on model type
58+
logger.info("Checking CAMeL Tools data packages...")
59+
catalogue = Catalogue.load_catalogue()
60+
61+
try:
62+
catalogue.download_package("morphology-db-msa-r13")
63+
catalogue.download_package("disambig-mle-calima-msa-r13")
64+
logger.info("msa-r13 packages installed")
65+
except Exception as e:
66+
logger.warning(f"Package installation warning: {e}")
67+
68+
# Install BERT package if using BERT model
69+
if self.model == "bert":
70+
try:
71+
catalogue.download_package("disambig-bert-unfactored-all")
72+
logger.info("BERT package installed")
73+
except Exception as e:
74+
logger.warning(f"BERT package installation warning: {e}")
75+
5676
if self.model == "mle":
5777
self.disambiguator = MLEDisambiguator.pretrained()
5878
logger.info("MLE disambiguator loaded")

dalla_data_processing/stemming/stemmer.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -473,12 +473,19 @@ def stem_dataset(
473473
catalogue = Catalogue.load_catalogue()
474474
try:
475475
catalogue.download_package("morphology-db-msa-r13")
476-
if model == "mle":
477-
catalogue.download_package("disambig-mle-calima-msa-r13")
478-
# For BERT, let it download automatically when pretrained() is called
479-
logger.info("CAMeL Tools data packages ready")
476+
catalogue.download_package("disambig-mle-calima-msa-r13")
477+
logger.info("msa-r13 packages installed")
480478
except Exception as e:
481-
logger.warning(f"Could not verify CAMeL packages: {e}")
479+
logger.warning(f"Package installation warning: {e}")
480+
481+
if model == "bert":
482+
try:
483+
catalogue.download_package("disambig-bert-unfactored-all")
484+
logger.info("BERT package installed")
485+
except Exception as e:
486+
logger.warning(f"BERT package installation warning: {e}")
487+
488+
logger.info("CAMeL Tools data packages ready")
482489

483490
logger.info("Loading additional words lists...")
484491
words_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -597,15 +604,21 @@ def stem(
597604
if not all(isinstance(t, str) for t in text_list):
598605
raise TypeError("All items in text list must be strings")
599606

600-
# Initialize disambiguator (cached globally if possible)
601607
logger.info(f"Initializing {model.upper()} disambiguator...")
602608
catalogue = Catalogue.load_catalogue()
603609
try:
604610
catalogue.download_package("morphology-db-msa-r13")
605-
if model == "mle":
606-
catalogue.download_package("disambig-mle-calima-msa-r13")
611+
catalogue.download_package("disambig-mle-calima-msa-r13")
612+
logger.info("msa-r13 packages installed")
607613
except Exception as e:
608-
logger.warning(f"Could not verify CAMeL packages: {e}")
614+
logger.warning(f"Package installation warning: {e}")
615+
616+
if model == "bert":
617+
try:
618+
catalogue.download_package("disambig-bert-unfactored-all")
619+
logger.info("BERT package installed")
620+
except Exception as e:
621+
logger.warning(f"BERT package installation warning: {e}")
609622

610623
if model == "mle":
611624
disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)

pyproject.toml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ authors = [
1111
{name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
1212
]
1313
readme = "README.md"
14-
requires-python = ">=3.12"
14+
requires-python = ">=3.12,<3.13"
1515
keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
1616
classifiers = [
1717
"Intended Audience :: Developers",
@@ -39,23 +39,24 @@ dev = [
3939
"pre-commit>=3.0.0",
4040
]
4141
dedup = [
42-
"camel-tools>=1.5.0",
42+
"camel-tools==1.5.7",
4343
]
4444
dedup-native = [
4545
"cffi>=1.15.0",
4646
]
4747
stem = [
48-
"camel-tools>=1.5.0",
48+
"camel-tools==1.5.7",
4949
]
5050
quality = [
51-
"camel-tools>=1.5.0",
51+
"camel-tools==1.5.7",
5252
]
5353
readability = [
5454
"textstat>=0.7.0",
5555
]
5656
pack = [
5757
"sentencepiece>=0.2.0",
58-
"rbpe",
58+
# "rbpe", # excluded due to transformers version conflict with camel-tools
59+
# users should install separately if needed: pip install rbpe
5960
"pyyaml",
6061
]
6162
all = [

0 commit comments

Comments
 (0)