Skip to content

Commit e550a59

Browse files
Merge pull request #21 from InseeFrLab/mateom
Adding NaiveCode2Text and a small script for exporting to .parquet format
2 parents 1bbae86 + 08b16e2 commit e550a59

21 files changed

Lines changed: 1358 additions & 7 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Personal usage
2+
test.ipynb
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[codz]

explorations.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ def sample_codes(fs: s3fs.S3FileSystem, population_path: str, code_column: str,
116116

117117
return sampled[code_column].to_numpy()
118118

119+
label_idx = n_nace_nodes + i
120+
if target_code in codes_dict:
121+
label_to_code_idx[label_idx] = codes_dict[target_code]
119122

120123
codes = sample_codes(
121124
fs=fs,

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"pca>=2.10.1",
2727
"plotly>=6.5.1",
2828
"polars>=1.38.1",
29+
"pyarrow>=23.0.1",
2930
"s3fs>=2024.12.0",
3031
"transformers>=4.57.3",
3132
"umap-learn>=0.5.11",

src/agents/NaiveCode2Text/__init__.py

Whitespace-only changes.
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import polars as pl
2+
import s3fs
3+
import numpy as np
4+
5+
6+
def sample_codes(
    fs: s3fs.S3FileSystem,
    population_path: str,
    code_column: str,
    n_codes: int
) -> np.ndarray:
    """
    Sample codes with replacement using dataframes from Polars.

    The whole parquet file is materialized in memory; see
    ``sample_codes_lazy`` for a lazy alternative.

    Args:
        fs (S3FileSystem): The filesystem for importation.
        population_path (str): The path of the parquet file of the population.
        code_column (str): The name of the column for codes.
        n_codes (int): The number of codes to sample.

    Returns:
        numpy.ndarray: An array of n_codes codes sampled with replacement.
    """
    # Read the full population once, then sample from the single code column.
    with fs.open(population_path, 'rb') as parquet_file:
        population = pl.read_parquet(parquet_file)

    drawn = (
        population
        .select(code_column)
        .sample(n=n_codes, with_replacement=True)
    )

    return drawn[code_column].to_numpy()
31+
32+
33+
def sample_codes_lazy(
    fs: s3fs.S3FileSystem,
    population_path: str,
    code_column: str,
    n_codes: int
) -> np.ndarray:
    """
    Sample codes with replacement using lazyframes from Polars.

    Args:
        fs (S3FileSystem): The filesystem for importation.
        population_path (str): The path of the parquet file of the population.
        code_column (str): The name of the column for codes.
        n_codes (int): The number of codes to sample.
            Fix: was annotated ``str``; it is used (and documented) as an int,
            consistent with ``sample_codes``.

    Returns:
        numpy.ndarray: An array of n_codes codes sampled with replacement.
    """
    # Everything stays inside the `with` block: the lazy scan reads from the
    # open file handle, so it must be collected before the handle is closed.
    with fs.open(population_path, 'rb') as f:
        lf = (
            pl.scan_parquet(f)
            .with_row_index("row_id")
        )

        # Only the row count is materialized here, not the data itself.
        total_rows = lf.select(pl.len()).collect().item()

        # Draw row ids with replacement; the inner join below emits one output
        # row per matching pair, so duplicated ids yield duplicated samples.
        random_ids = (
            pl.Series("row_id", range(total_rows))
            .sample(n=n_codes, with_replacement=True)
            .to_frame()
            .lazy()
        )

        sampled = lf.join(random_ids, on="row_id", how="inner")

        df = sampled.collect()

    return df[code_column].to_numpy()
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from src.neo4j_graph.graph import Graph
2+
3+
4+
def get_code_information(
    graph: Graph,
    code: str
) -> dict:
    """
    Retrieve code specifications from a Neo4j graph.

    Args:
        graph (Graph from local library): The Neo4j graph.
        code (str): The code to specify.

    Returns:
        dict: Every accessible information of the code in the graph
            (code, level, name, description, includes, includes_also,
            excludes, implementation_rule, parent_code, children,
            children_count). An empty dict when the code is not found.
    """
    # Single query: the node itself, its (optional) parent, and the list of
    # its (optional) children collected as {code, name} maps.
    query = """
    MATCH (node {CODE: $code})
    OPTIONAL MATCH (node)<-[:HAS_CHILD]-(parent)
    OPTIONAL MATCH (node)-[:HAS_CHILD]->(child)
    WITH node, parent, collect({code: child.CODE, name: child.NAME}) as children
    RETURN node.CODE as code,
           node.LEVEL as level,
           node.NAME as name,
           node.text as description,
           node.Includes as includes,
           node.IncludesAlso as includes_also,
           node.Excludes as excludes,
           node.Implementation_rule as implementation_rule,
           parent.CODE as parent_code,
           children,
           size(children) as children_count
    """
    result = graph.graph.query(query, params={"code": code})

    if not result:
        print("No result in get_code_information")
        # Fix: the original returned an empty tuple `()` despite the `-> dict`
        # annotation. An empty dict matches the contract and is equally falsy
        # for callers that truth-test the result.
        return {}

    return result[0]
43+
44+
45+
def NAF_to_NACE(
    code: str
) -> str:
    """
    For the case of NAF code format (DDDDL), transform it into NACE (DD.DD).

    Args:
        code (str): The code in NAF format to transform.

    Returns:
        str: The code in NACE format.
    """
    # NAF "DDDDL" -> NACE "DD.DD": keep the four digits, drop the trailing
    # letter, and insert a dot between the division and the sub-division.
    division = code[:2]
    subdivision = code[2:4]
    return f"{division}.{subdivision}"
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# For code sampling
POPULATION_PATH = "projet-ape/data/08112022_27102024/naf2025/split/df_train.parquet"
CODE_COLUMN = "nace2025"

# For prompt creation
PROMPT_PATH = "src/agents/NaiveCode2Text/prompts/"

# To retrieve specifications of every code correctly:
# dividers used to split the raw includes/examples/excludes text fields.
INCLUDES_DIVIDER = "\n-"
EXAMPLES_DIVIDER = "\n"
EXCLUDE_DIVIDER = "\n"

# Randomization for specifications
# NOTE(review): *_GEOM_PROB values look like geometric-distribution success
# probabilities used when sub-sampling specifications — confirm against the
# prompt builder.
RANDOM_SPEC_SAMPLING = True
RANDOM_INCLUDES_GEOM_PROB = 0.3
RANDOM_INCLUDES_MIN = 1
RANDOM_INCLUDES_MAX = None  # None = up to the max number of includes
RANDOM_EXAMPLES_GEOM_PROB = 0.2
RANDOM_EXAMPLES_MIN = 1
RANDOM_EXAMPLES_MAX = None  # None = up to the max number of examples per include

# Exportation
OUTPUT_PATH = "projet-ape/synthetic_data_test/naive/"
OUTPUT_FORMAT = ".parquet"  # .txt or .parquet
BATCH_SIZE = 5  # If choosing .parquet output format

# LLM Hyperparameters
MODEL = "gpt-oss:20b"
TEMPERATURE = 1.8
LANGUAGE = "English"

# Generation specifications
N_CODES = 12  # Number of codes to sample
NB_LABELS = 10  # Number of labels to generate per code
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import os
import logging
import time

from dotenv import load_dotenv
import s3fs
from openai import OpenAI

from src.agents.NaiveCode2Text.config_naive import \
    MODEL, TEMPERATURE, OUTPUT_PATH, N_CODES, POPULATION_PATH, CODE_COLUMN, \
    OUTPUT_FORMAT, BATCH_SIZE, LANGUAGE, NB_LABELS, PROMPT_PATH, \
    INCLUDES_DIVIDER, EXAMPLES_DIVIDER, EXCLUDE_DIVIDER, RANDOM_SPEC_SAMPLING, \
    RANDOM_INCLUDES_GEOM_PROB, RANDOM_INCLUDES_MIN, RANDOM_INCLUDES_MAX, \
    RANDOM_EXAMPLES_GEOM_PROB, RANDOM_EXAMPLES_MIN, RANDOM_EXAMPLES_MAX
from src.agents.NaiveCode2Text.prompts import prompt_builder, label_generator
from src.agents.NaiveCode2Text.code_retrieval import code_sampler, code_specifier
from src.neo4j_graph.graph import Graph, Neo4JConfig

# Logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Environment
load_dotenv(override=True)

if __name__ == "__main__":
    # Clock for speed testing
    # NOTE(review): `start` is only bound when OUTPUT_FORMAT == ".txt"; the
    # matching `end - start` below is guarded by the same condition, so this
    # is safe — but brittle if a new output format reuses the timing.
    if OUTPUT_FORMAT == ".txt":
        start = time.perf_counter()

    # Access configurations (S3 credentials come from the environment)
    FS = s3fs.S3FileSystem(
        client_kwargs={'endpoint_url': os.environ["AWS_ENDPOINT_URL"]},
        key=os.environ["AWS_ACCESS_KEY_ID"],
        secret=os.environ["AWS_SECRET_ACCESS_KEY"],
        token=os.environ["AWS_SESSION_TOKEN"]
    )

    LLM_API_KEY = os.environ["LLM_API_KEY"]
    LLM_URL = os.environ["LLM_URL"]
    LLM_CLIENT = OpenAI(api_key=LLM_API_KEY, base_url=LLM_URL)

    # Sampling from original data (with replacement, lazy parquet scan)
    logger.info("Sampling from data...")
    code_list = code_sampler.sample_codes_lazy(
        fs=FS,
        population_path=POPULATION_PATH,
        code_column=CODE_COLUMN,
        n_codes=N_CODES
    )

    # NAF to NACE: "DDDDL" -> "DD.DD", the format stored in the Neo4j graph
    logger.info("Transforming codes from NAF to NACE...")
    code_list = [code_specifier.NAF_to_NACE(code) for code in code_list]

    # Neo4j connection
    logger.info("Connecting to Neo4j graph...")
    notice_graph = Graph(Neo4JConfig(
        url=os.environ["NEO4J_URL"],
        username=os.environ["NEO4J_USERNAME"],
        password=os.environ["NEO4J_PWD"]
    ))

    # Define an automatic name for output.
    # ":" and "." are stripped so the model name / temperature are
    # filesystem-safe; the extension is appended after the replace.
    file_name = f"generation_{MODEL}_temp{TEMPERATURE}".replace(":", "-").replace(".", "") \
        + OUTPUT_FORMAT
    FINAL_PATH = OUTPUT_PATH + file_name

    # Prompt generation
    logger.info("Generating prompts...")

    # Per-batch buffers: reset after each parquet flush so that names/labels
    # always line up with the sliced codes passed to export_to_parquet.
    name_list = []
    label_list = []

    # Model set up (structured-output model sized to NB_LABELS)
    LabelGenerationModel = label_generator.build_label_generation_model(NB_LABELS)

    system_prompt = prompt_builder.build_system_prompt(
        prompt_path=PROMPT_PATH,
        language=LANGUAGE,
        nb_labels=NB_LABELS
    )

    for i, code in enumerate(code_list):
        logger.info(f"Processing step {i+1}...")

        # Get code details from Neo4j
        code_details = code_specifier.get_code_information(
            graph=notice_graph,
            code=code
        )

        # For exportation purpose
        # NOTE(review): raises if get_code_information found no result
        # (empty/falsy return has no "name" key) — confirm sampled codes
        # always exist in the graph.
        name_list.append(code_details["name"])

        # Build prompts
        user_prompt = prompt_builder.build_user_prompt(
            code_details=code_details,
            language=LANGUAGE,
            nb_labels=NB_LABELS,
            includes_divider=INCLUDES_DIVIDER,
            examples_divider=EXAMPLES_DIVIDER,
            excludes_divider=EXCLUDE_DIVIDER,
            random_spec_sampling=RANDOM_SPEC_SAMPLING,
            random_includes_geom_prob=RANDOM_INCLUDES_GEOM_PROB,
            random_includes_min=RANDOM_INCLUDES_MIN,
            random_includes_max=RANDOM_INCLUDES_MAX,
            random_examples_geom_prob=RANDOM_EXAMPLES_GEOM_PROB,
            random_examples_min=RANDOM_EXAMPLES_MIN,
            random_examples_max=RANDOM_EXAMPLES_MAX
        )

        # Ask the chatbot
        generation = label_generator.ask_model(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            llm_client=LLM_CLIENT,
            model=MODEL,
            temperature=TEMPERATURE,
            LabelGeneration=LabelGenerationModel
        )

        label_list.append(generation.labels)

        # Flush every full batch to parquet, then clear the buffers.
        if OUTPUT_FORMAT == ".parquet" and (i+1) % BATCH_SIZE == 0:
            logger.info("Saving intermediate results...")
            label_generator.export_to_parquet(
                codes=code_list[i+1-BATCH_SIZE:i+1],
                names=name_list,
                labels=label_list,
                file_path=FINAL_PATH,
                fs=FS
            )
            label_list = []
            name_list = []

    end = time.perf_counter()

    if OUTPUT_FORMAT == ".txt":
        logger.info("Saving results to txt...")
        label_generator.export_to_txt(
            codes=code_list,
            names=name_list,
            labels=label_list,
            file_path=FINAL_PATH,
            generation_time=end-start
        )

    elif OUTPUT_FORMAT == ".parquet":
        logger.info("Saving final results...")
        # Rows past the last full batch (when N_CODES is not a multiple of
        # BATCH_SIZE) were never flushed inside the loop; save them now.
        first_unsaved_index = BATCH_SIZE*(N_CODES//BATCH_SIZE)
        if first_unsaved_index < N_CODES:
            label_generator.export_to_parquet(
                codes=code_list[BATCH_SIZE*(N_CODES//BATCH_SIZE):],
                names=name_list,
                labels=label_list,
                file_path=FINAL_PATH,
                fs=FS
            )

0 commit comments

Comments
 (0)