1919
2020query = """
2121MATCH path = (root)-[*]->(n)
22- WHERE n.LEVEL = 5
22+ WHERE n.FINAL = 1
2323 AND n.embedding IS NOT NULL
2424 AND root.LEVEL = 0
2525RETURN n.embedding as embedding,
2626 n.NAME as name,
27+ n.CODE as code,
2728 [node IN nodes(path) | node.CODE] as path_codes,
2829 [node IN nodes(path) | node.LEVEL] as path_levels
2930"""
3334embeddings = []
3435names = []
3536paths = []
37+ codes_dict = {}
3638
37- for record in results :
39+ for idx , record in enumerate ( results ) :
3840 embeddings .append (record ["embedding" ])
3941 names .append (record ["name" ])
42+ code = record ["code" ]
43+ code_clean = code .replace ("." , "" ).replace (" " , "" )
44+ codes_dict [code_clean ] = idx
4045
4146 path_str = " → " .join ([
4247 name for lvl , name in zip (record ["path_levels" ], record ["path_codes" ])
4550
4651
4752print (f"Nœuds récupérés: { len (names )} " )
48- print (embeddings )
4953
54+ n_nace_nodes = len (embeddings )
5055
51- # %%
52- # Add a query in the embedding space
53-
54- queries = ["Je vends des croissants" , "Livreur de taxi" , "Coiffeur" ]
55- emb_model = OpenAIEmbeddings (
56- model = os .environ ['EMBEDDING_MODEL' ],
57- openai_api_base = os .environ ['URL_EMBEDDING_API' ],
58- openai_api_key = "EMPTY" ,
59- tiktoken_enabled = False ,
60- )
61- for i , query in iter (queries ):
62- query_emb = emb_model .embed_query (query )
63- embeddings .append (query_emb )
64- names .append (f"Query { i } " )
65- paths .append (query )
66-
67-
68- # %% UMAP
69- reducer = umap .UMAP (random_state = 42 , n_neighbors = 10 , min_dist = 0.1 )
70- embeddings = np .array (embeddings )
71- coords = reducer .fit_transform (embeddings )
72- X , Y = coords .T
73-
74- # %% Visualisation interactive
75- fig = go .Figure ()
76-
77- fig .add_trace (go .Scatter (
78- x = X , y = Y ,
79- mode = 'markers' ,
80- marker = dict (
81- size = 10 ,
82- color = np .arange (len (X )), # Couleur par index
83- colorscale = 'Viridis' ,
84- showscale = True ,
85- line = dict (width = 0.5 , color = 'white' )
86- ),
87- text = [f"<b>{ name } </b><br><br>{ path } " for name , path in zip (names , paths )],
88- hovertemplate = '%{text}<extra></extra>'
89- ))
9056
91- fig .update_layout (
92- title = "Nœuds de niveau 5" ,
93- xaxis_title = "UMAP 1" ,
94- yaxis_title = "UMAP 2" ,
95- width = 1200 ,
96- height = 800 ,
97- hovermode = 'closest' ,
98- plot_bgcolor = 'white' ,
99- xaxis = dict (showgrid = True , gridcolor = 'lightgray' ),
100- yaxis = dict (showgrid = True , gridcolor = 'lightgray' )
101- )
10257
103- fig .show ()
10458# %%
10559import os
10660import s3fs
# SECURITY: S3 credentials must never be written into source code. They are
# read from the process environment instead. Any access key, secret key or
# session token that was previously committed in this file must be treated
# as compromised and revoked.
_missing_credentials = [
    var
    for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(var)
]
if _missing_credentials:
    raise RuntimeError(
        "Missing S3 credentials in the environment: "
        + ", ".join(_missing_credentials)
    )

os.environ["AWS_DEFAULT_REGION"] = 'us-east-1'

# Number of (label, code) pairs requested from the sampling step.
N_CODES = 20

# File-system handle on the MinIO endpoint. The session token is optional:
# when the variable is absent, `None` is passed to s3fs.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://' + 'minio.lab.sspcloud.fr'},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ.get("AWS_SESSION_TOKEN"),
)
11673
11774
75+
11876def sample_codes (fs : s3fs .S3FileSystem , population_path : str , code_column : str , n_codes : int ):
11977 """
12078 Sample codes using Polars from S3.
@@ -139,10 +97,113 @@ def sample_codes(fs: s3fs.S3FileSystem, population_path: str, code_column: str,
13997 fs = fs ,
14098 population_path = path ,
14199 code_column = columns ,
142- n_codes = 10 )
100+ n_codes = N_CODES )
101+
def _normalize_code(code):
    """Return *code* as a string with dots and spaces removed.

    The keys of ``codes_dict`` are built by stripping "." and " " from the
    node codes, so a target code has to go through the same transformation
    before it can be looked up there.
    """
    return str(code).replace(".", "").replace(" ", "")


# Split the sampled pairs into two parallel tuples.
# NOTE(review): assumes `codes` is a non-empty iterable of (label, code)
# pairs -- confirm against sample_codes().
labels, target_codes = zip(*codes)

emb_model = OpenAIEmbeddings(
    model=os.environ['EMBEDDING_MODEL'],
    openai_api_base=os.environ['URL_EMBEDDING_API'],
    openai_api_key="EMPTY",
    tiktoken_enabled=False,
)

# One embedding per label, in the same order as `labels`.
labels_embeddings = emb_model.embed_documents(list(labels))

# Maps the position of a label in the combined `embeddings` list to the
# position of the node whose (normalized) code equals the label's target code.
label_to_code_idx = {}

for i, (label, label_emb, target_code) in enumerate(
    zip(labels, labels_embeddings, target_codes)
):
    embeddings.append(label_emb)
    names.append(label[:50])  # keep hover titles short
    paths.append(f"Libellé -> Code cible: {target_code} ")

    # Labels are appended after the n_nace_nodes node entries, hence the offset.
    label_idx = n_nace_nodes + i

    # Look up with the normalized form; a raw code containing "." or " "
    # would otherwise never match a key of codes_dict.
    normalized_code = _normalize_code(target_code)
    if normalized_code in codes_dict:
        label_to_code_idx[label_idx] = codes_dict[normalized_code]
123+
# %%
# Show which label positions were linked to which node positions.
print(label_to_code_idx)

# %% UMAP
# Stack every embedding (node entries first, then the appended labels) into
# one array and project it to two dimensions with a fixed seed.
embeddings = np.array(embeddings)
reducer = umap.UMAP(
    random_state=42,
    n_neighbors=10,
    min_dist=0.1,
)
coords = reducer.fit_transform(embeddings)
# Column 0 is the horizontal coordinate, column 1 the vertical one.
X = coords.T[0]
Y = coords.T[1]
132+
133+
134+
135+
# %% Interactive visualisation
def _hover_texts(point_names, point_paths):
    """Build one HTML hover string per point from its name and its path."""
    return [
        f"<b>{name} </b><br><br>{path} "
        for name, path in zip(point_names, point_paths)
    ]


fig = go.Figure()

# 1. Connection lines are added BEFORE the markers so the markers are drawn
#    on top of them. One two-point trace per matched (label, node) pair.
link_style = {'color': 'rgba(150, 150, 150, 0.8)', 'width': 3, 'dash': 'solid'}
for label_idx, code_idx in label_to_code_idx.items():
    link = go.Scatter(
        x=[X[label_idx], X[code_idx]],
        y=[Y[label_idx], Y[code_idx]],
        mode='lines',
        line=link_style,
        showlegend=False,
        hoverinfo='skip',
    )
    fig.add_trace(link)

# 2. NACE nodes (circles): the first n_nace_nodes points, coloured by index.
nace_marker = {
    'size': 10,
    'color': np.arange(n_nace_nodes),
    'colorscale': 'Viridis',
    'showscale': True,
    'line': {'width': 0.5, 'color': 'white'},
    'symbol': 'circle',
}
nace_trace = go.Scatter(
    x=X[:n_nace_nodes],
    y=Y[:n_nace_nodes],
    mode='markers',
    name='Codes NACE',
    marker=nace_marker,
    text=_hover_texts(names[:n_nace_nodes], paths[:n_nace_nodes]),
    hovertemplate='%{text}<extra></extra>',
)
fig.add_trace(nace_trace)

# 3. Sampled labels (stars): every point after the first n_nace_nodes.
#    Other symbols that could be used: 'diamond', 'square', 'cross', 'x',
#    'triangle-up'.
label_marker = {
    'size': 15,
    'color': 'red',
    'symbol': 'star',
    'line': {'width': 1, 'color': 'darkred'},
}
label_trace = go.Scatter(
    x=X[n_nace_nodes:],
    y=Y[n_nace_nodes:],
    mode='markers',
    name='Libellés',
    marker=label_marker,
    text=_hover_texts(names[n_nace_nodes:], paths[n_nace_nodes:]),
    hovertemplate='%{text}<extra></extra>',
)
fig.add_trace(label_trace)

# NOTE(review): the title mentions level 5; confirm it still matches the
# node filter used by the graph query.
grid_axis = {'showgrid': True, 'gridcolor': 'lightgray'}
fig.update_layout(
    title="Nœuds NACE niveau 5 et libellés échantillonnés",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
    width=1400,
    height=900,
    hovermode='closest',
    plot_bgcolor='white',
    xaxis=dict(grid_axis),
    yaxis=dict(grid_axis),
    # Legend pinned inside the top-right corner of the plot area.
    legend={'yanchor': "top", 'y': 0.99, 'xanchor': "right", 'x': 0.99},
)

fig.show()

# %%
# Export the figure as a standalone HTML file.
fig.write_html("umap_visualization.html")
146207
147208# %%
148209result = await classify_navigator (labels [0 ])
0 commit comments