|
| 1 | +import os |
| 2 | +import logging |
| 3 | +import time |
| 4 | + |
| 5 | +from dotenv import load_dotenv |
| 6 | +import s3fs |
| 7 | +from openai import OpenAI |
| 8 | + |
| 9 | +from src.agents.NaiveCode2Text.config_naive import \ |
| 10 | + MODEL, TEMPERATURE, OUTPUT_PATH, N_CODES, POPULATION_PATH, CODE_COLUMN, \ |
| 11 | + OUTPUT_FORMAT, BATCH_SIZE, LANGUAGE, NB_LABELS, PROMPT_PATH, \ |
| 12 | + INCLUDES_DIVIDER, EXAMPLES_DIVIDER, EXCLUDE_DIVIDER, RANDOM_SPEC_SAMPLING, \ |
| 13 | + RANDOM_INCLUDES_GEOM_PROB, RANDOM_INCLUDES_MIN, RANDOM_INCLUDES_MAX, \ |
| 14 | + RANDOM_EXAMPLES_GEOM_PROB, RANDOM_EXAMPLES_MIN, RANDOM_EXAMPLES_MAX |
| 15 | +from src.agents.NaiveCode2Text.prompts import prompt_builder, label_generator |
| 16 | +from src.agents.NaiveCode2Text.code_retrieval import code_sampler, code_specifier |
| 17 | +from src.neo4j_graph.graph import Graph, Neo4JConfig |
| 18 | + |
# Module-level logger; basicConfig is acceptable here because this file is
# the script entry point, not a library module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (AWS/S3, LLM and Neo4j credentials are read
# from os.environ below); override=True lets .env values take precedence
# over variables already set in the environment.
load_dotenv(override=True)
| 25 | + |
if __name__ == "__main__":
    # Start the clock unconditionally: previously it was only set for the
    # ".txt" format, leaving `start` undefined (latent NameError) for any
    # other format even though `end` was always read.
    start = time.perf_counter()

    # S3 filesystem used both to read the population data and to write
    # the parquet output.
    FS = s3fs.S3FileSystem(
        client_kwargs={'endpoint_url': os.environ["AWS_ENDPOINT_URL"]},
        key=os.environ["AWS_ACCESS_KEY_ID"],
        secret=os.environ["AWS_SECRET_ACCESS_KEY"],
        token=os.environ["AWS_SESSION_TOKEN"]
    )

    # OpenAI-compatible client pointed at a custom endpoint.
    LLM_API_KEY = os.environ["LLM_API_KEY"]
    LLM_URL = os.environ["LLM_URL"]
    LLM_CLIENT = OpenAI(api_key=LLM_API_KEY, base_url=LLM_URL)

    # Sample N_CODES codes from the original population.
    logger.info("Sampling from data...")
    code_list = code_sampler.sample_codes_lazy(
        fs=FS,
        population_path=POPULATION_PATH,
        code_column=CODE_COLUMN,
        n_codes=N_CODES
    )

    # Convert NAF codes to NACE. The comprehension also materializes the
    # lazy sample into a list, which the slicing below relies on.
    logger.info("Transforming codes from NAF to NACE...")
    code_list = [code_specifier.NAF_to_NACE(code) for code in code_list]

    # Neo4j connection used to fetch the details of each code.
    logger.info("Connecting to Neo4j graph...")
    notice_graph = Graph(Neo4JConfig(
        url=os.environ["NEO4J_URL"],
        username=os.environ["NEO4J_USERNAME"],
        password=os.environ["NEO4J_PWD"]
    ))

    # Automatic output name, e.g. "generation_model_temp07.parquet";
    # ":" and "." are stripped/replaced so the name is path-safe.
    file_name = f"generation_{MODEL}_temp{TEMPERATURE}".replace(":", "-").replace(".", "") \
        + OUTPUT_FORMAT
    FINAL_PATH = OUTPUT_PATH + file_name

    # Prompt generation
    logger.info("Generating prompts...")

    name_list = []   # official name of each processed code (export column)
    label_list = []  # generated labels, one entry per code

    # Structured-output model constraining the LLM to NB_LABELS labels.
    LabelGenerationModel = label_generator.build_label_generation_model(NB_LABELS)

    system_prompt = prompt_builder.build_system_prompt(
        prompt_path=PROMPT_PATH,
        language=LANGUAGE,
        nb_labels=NB_LABELS
    )

    for i, code in enumerate(code_list):
        # Lazy %-formatting: the string is only built if INFO is enabled.
        logger.info("Processing step %d...", i + 1)

        # Get code details from Neo4j.
        code_details = code_specifier.get_code_information(
            graph=notice_graph,
            code=code
        )

        # Kept for the export alongside the generated labels.
        name_list.append(code_details["name"])

        # Build the user prompt for this code.
        user_prompt = prompt_builder.build_user_prompt(
            code_details=code_details,
            language=LANGUAGE,
            nb_labels=NB_LABELS,
            includes_divider=INCLUDES_DIVIDER,
            examples_divider=EXAMPLES_DIVIDER,
            excludes_divider=EXCLUDE_DIVIDER,
            random_spec_sampling=RANDOM_SPEC_SAMPLING,
            random_includes_geom_prob=RANDOM_INCLUDES_GEOM_PROB,
            random_includes_min=RANDOM_INCLUDES_MIN,
            random_includes_max=RANDOM_INCLUDES_MAX,
            random_examples_geom_prob=RANDOM_EXAMPLES_GEOM_PROB,
            random_examples_min=RANDOM_EXAMPLES_MIN,
            random_examples_max=RANDOM_EXAMPLES_MAX
        )

        # Query the LLM for a structured label generation.
        generation = label_generator.ask_model(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            llm_client=LLM_CLIENT,
            model=MODEL,
            temperature=TEMPERATURE,
            LabelGeneration=LabelGenerationModel
        )

        label_list.append(generation.labels)

        # Parquet output is flushed every BATCH_SIZE codes so a crash loses
        # at most one batch of generations.
        if OUTPUT_FORMAT == ".parquet" and (i + 1) % BATCH_SIZE == 0:
            logger.info("Saving intermediate results...")
            label_generator.export_to_parquet(
                codes=code_list[i + 1 - BATCH_SIZE:i + 1],
                names=name_list,
                labels=label_list,
                file_path=FINAL_PATH,
                fs=FS
            )
            # Reset the per-batch accumulators to stay aligned with the
            # codes slice of the next batch.
            label_list = []
            name_list = []

    end = time.perf_counter()

    if OUTPUT_FORMAT == ".txt":
        logger.info("Saving results to txt...")
        label_generator.export_to_txt(
            codes=code_list,
            names=name_list,
            labels=label_list,
            file_path=FINAL_PATH,
            generation_time=end - start
        )

    elif OUTPUT_FORMAT == ".parquet":
        # Flush the last (possibly partial) batch that the in-loop saves
        # did not cover.
        logger.info("Saving final results...")
        first_unsaved_index = BATCH_SIZE * (N_CODES // BATCH_SIZE)
        if first_unsaved_index < N_CODES:
            label_generator.export_to_parquet(
                # Reuse first_unsaved_index instead of recomputing the
                # same expression inline (previous code duplicated it).
                codes=code_list[first_unsaved_index:],
                names=name_list,
                labels=label_list,
                file_path=FINAL_PATH,
                fs=FS
            )

    else:
        # Previously an unrecognized format silently produced no output;
        # fail loudly so misconfiguration is visible.
        logger.warning(
            "Unknown OUTPUT_FORMAT %r: no results were saved.", OUTPUT_FORMAT
        )
0 commit comments