-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathcode_extractor.py
More file actions
1070 lines (867 loc) · 44 KB
/
code_extractor.py
File metadata and controls
1070 lines (867 loc) · 44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from __future__ import annotations
import ast
import time
from importlib.util import find_spec
from itertools import chain
from typing import TYPE_CHECKING, Optional
import libcst as cst
from libcst.codemod import CodemodContext
from libcst.codemod.visitors import AddImportsVisitor, GatherImportsVisitor, RemoveImportsVisitor
from libcst.helpers import calculate_module_and_package
from codeflash.cli_cmds.console import logger
from codeflash.code_utils.config_consts import MAX_CONTEXT_LEN_REVIEW
from codeflash.languages.base import Language
from codeflash.models.models import FunctionParent
if TYPE_CHECKING:
from pathlib import Path
from libcst.helpers import ModuleNameAndPackage
from codeflash.discovery.functions_to_optimize import FunctionToOptimize
from codeflash.models.models import FunctionSource
class GlobalFunctionCollector(cst.CSTVisitor):
    """Gather function definitions that live outside of classes and other functions.

    The visitor never descends into ``ClassDef`` or ``FunctionDef`` bodies, so
    methods and nested functions are not recorded.
    """

    def __init__(self) -> None:
        super().__init__()
        # Most recent definition seen for each function name.
        self.functions: dict[str, cst.FunctionDef] = {}
        # Function names in order of first appearance.
        self.function_order: list[str] = []

    def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
        # Methods are not module-level functions: skip the whole class body.
        return False

    def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
        func_name = node.name.value
        if func_name not in self.function_order:
            self.function_order.append(func_name)
        # A later definition with the same name overrides the earlier one.
        self.functions[func_name] = node
        # Returning False keeps nested functions out of the collection.
        return False
class GlobalFunctionTransformer(cst.CSTTransformer):
    """Replace or add functions in the original module using definitions from the new module.

    Functions whose name appears in ``new_functions`` are swapped for the new
    definition; names that were never encountered are inserted after the last
    top-level function or class (or after the imports when there is none).
    """

    def __init__(self, new_functions: dict[str, cst.FunctionDef], new_function_order: list[str]) -> None:
        super().__init__()
        self.new_functions = new_functions
        self.new_function_order = new_function_order
        # Names that were replaced in place and therefore must not be appended again.
        self.processed_functions: set[str] = set()

    def visit_ClassDef(self, node: cst.ClassDef) -> bool:
        # Do not touch methods.
        return False

    def visit_FunctionDef(self, node: cst.FunctionDef) -> bool:
        # Do not touch nested functions.
        return False

    def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef) -> cst.FunctionDef:
        func_name = original_node.name.value
        if func_name not in self.new_functions:
            return updated_node
        self.processed_functions.add(func_name)
        return self.new_functions[func_name]

    def leave_Module(self, original_node: cst.Module, updated_node: cst.Module) -> cst.Module:
        body = list(updated_node.body)
        # Functions present in the new file that were not replaced in place.
        pending = [
            self.new_functions[func_name]
            for func_name in self.new_function_order
            if func_name in self.new_functions and func_name not in self.processed_functions
        ]
        if not pending:
            return updated_node.with_changes(body=body)

        # Default to just after the imports; move past the last top-level function/class if any exists.
        position = find_insertion_index_after_imports(updated_node)
        for idx, statement in enumerate(body):
            if isinstance(statement, (cst.FunctionDef, cst.ClassDef)):
                position = idx + 1

        # Prepend an empty line to every inserted function.
        spaced_functions = [
            func.with_changes(leading_lines=[cst.EmptyLine(), *func.leading_lines]) for func in pending
        ]
        body[position:position] = spaced_functions
        return updated_node.with_changes(body=body)
def collect_referenced_names(node: cst.CSTNode) -> set[str]:
    """Return the value of every ``cst.Name`` found in ``node`` or any of its descendants."""
    found: set[str] = set()
    # Explicit work list instead of a recursive helper; visiting order does not matter for a set.
    pending: list[cst.CSTNode] = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, cst.Name):
            found.add(current.value)
        pending.extend(current.children)
    return found
class GlobalAssignmentCollector(cst.CSTVisitor):
    """Record assignments to plain names made outside classes, functions and ``if`` blocks."""

    def __init__(self) -> None:
        super().__init__()
        # Last assignment statement seen for each assigned name.
        self.assignments: dict[str, cst.Assign | cst.AnnAssign] = {}
        # Assigned names in order of first appearance.
        self.assignment_order: list[str] = []
        # Number of ``if`` statements currently being traversed.
        self.if_else_depth = 0

    def _remember(self, name: str, node: cst.Assign | cst.AnnAssign) -> None:
        """Store ``node`` as the assignment for ``name``, keeping first-seen ordering."""
        self.assignments[name] = node
        if name not in self.assignment_order:
            self.assignment_order.append(name)

    def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
        return False

    def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
        return False

    def visit_If(self, node: cst.If) -> Optional[bool]:
        self.if_else_depth += 1
        return True

    def leave_If(self, original_node: cst.If) -> None:
        self.if_else_depth -= 1

    def visit_Assign(self, node: cst.Assign) -> Optional[bool]:
        if self.if_else_depth == 0:
            # Only simple ``name = value`` targets are tracked (no attributes, subscripts or tuples).
            for assign_target in node.targets:
                if isinstance(assign_target.target, cst.Name):
                    self._remember(assign_target.target.value, node)
        return True

    def visit_AnnAssign(self, node: cst.AnnAssign) -> Optional[bool]:
        outside_if = self.if_else_depth == 0
        # Bare annotations without a value (``x: int``) are ignored.
        if outside_if and isinstance(node.target, cst.Name) and node.value is not None:
            self._remember(node.target.value, node)
        return True
def find_insertion_index_after_imports(node: cst.Module) -> int:
    """Return the index just past the last leading import statement of the module.

    A statement counts as an import when it is a simple statement line containing
    at least one import, or an ``if`` whose body consists solely of import lines.
    Scanning stops at the first top-level class or function definition. Returns 0
    when no import is found before that point.
    """
    import_types = (cst.Import, cst.ImportFrom)
    last_import_end = 0
    for position, statement in enumerate(node.body, start=1):
        if isinstance(statement, cst.SimpleStatementLine):
            counts_as_import = any(isinstance(part, import_types) for part in statement.body)
        elif isinstance(statement, cst.If):
            counts_as_import = all(
                isinstance(inner, cst.SimpleStatementLine)
                and all(isinstance(part, import_types) for part in inner.body)
                for inner in statement.body.body
            )
        else:
            counts_as_import = False

        if counts_as_import:
            last_import_end = position

        # Imports can technically appear anywhere in a file, even at the bottom.
        # Stopping at the first definition prevents a stray late import from
        # pushing the insertion index below actual code definitions.
        if isinstance(statement, (cst.ClassDef, cst.FunctionDef)):
            break
    return last_import_end
class GlobalAssignmentTransformer(cst.CSTTransformer):
    """Replace global assignments in the original module with those from the new module.

    Assignments whose name already exists are replaced in place. Names that were
    not encountered are inserted: after the imports when their value references no
    top-level class/function, otherwise after the last top-level definition.
    """

    def __init__(self, new_assignments: dict[str, cst.Assign | cst.AnnAssign], new_assignment_order: list[str]) -> None:
        super().__init__()
        self.new_assignments = new_assignments
        self.new_assignment_order = new_assignment_order
        # Names that were replaced in place and must not be inserted again.
        self.processed_assignments: set[str] = set()
        # Number of ``if`` statements currently being traversed.
        self.if_else_depth = 0

    def visit_ClassDef(self, node: cst.ClassDef) -> bool:
        return False

    def visit_FunctionDef(self, node: cst.FunctionDef) -> bool:
        return False

    def visit_If(self, node: cst.If) -> None:
        self.if_else_depth += 1

    def leave_If(self, original_node: cst.If, updated_node: cst.If) -> cst.If:
        self.if_else_depth -= 1
        return updated_node

    def leave_Assign(self, original_node: cst.Assign, updated_node: cst.Assign) -> cst.CSTNode:
        # Assignments nested in ``if`` blocks are left untouched.
        if self.if_else_depth > 0:
            return updated_node
        for assign_target in original_node.targets:
            if not isinstance(assign_target.target, cst.Name):
                continue
            target_name = assign_target.target.value
            if target_name in self.new_assignments:
                self.processed_assignments.add(target_name)
                return self.new_assignments[target_name]
        return updated_node

    def leave_AnnAssign(self, original_node: cst.AnnAssign, updated_node: cst.AnnAssign) -> cst.CSTNode:
        # Assignments nested in ``if`` blocks are left untouched.
        if self.if_else_depth > 0:
            return updated_node
        if isinstance(original_node.target, cst.Name):
            target_name = original_node.target.value
            if target_name in self.new_assignments:
                self.processed_assignments.add(target_name)
                return self.new_assignments[target_name]
        return updated_node

    def leave_Module(self, original_node: cst.Module, updated_node: cst.Module) -> cst.Module:
        body = list(updated_node.body)

        # Assignments from the new file that did not replace anything in place.
        pending = [
            self.new_assignments[target_name]
            for target_name in self.new_assignment_order
            if target_name in self.new_assignments and target_name not in self.processed_assignments
        ]
        if not pending:
            return updated_node.with_changes(body=body)

        # Names of top-level classes and functions an assignment value might reference.
        definition_names: set[str] = {
            statement.name.value for statement in body if isinstance(statement, (cst.ClassDef, cst.FunctionDef))
        }

        # Split by dependency: values referencing a top-level definition must come after the definitions.
        independent: list[cst.Assign | cst.AnnAssign] = []
        dependent: list[cst.Assign | cst.AnnAssign] = []
        for assignment in pending:
            value_node = assignment.value
            if value_node is not None and collect_referenced_names(value_node) & definition_names:
                dependent.append(assignment)
            else:
                independent.append(assignment)

        if independent:
            position = find_insertion_index_after_imports(updated_node)
            body[position:position] = [
                cst.SimpleStatementLine([assignment], leading_lines=[cst.EmptyLine()]) for assignment in independent
            ]

        if dependent:
            # Fall back to "after imports" when the module has no top-level definition at all.
            position = find_insertion_index_after_imports(cst.Module(body=body))
            for idx, statement in enumerate(body):
                if isinstance(statement, (cst.FunctionDef, cst.ClassDef)):
                    position = idx + 1
            body[position:position] = [
                cst.SimpleStatementLine([assignment], leading_lines=[cst.EmptyLine()]) for assignment in dependent
            ]

        return updated_node.with_changes(body=body)
class GlobalStatementTransformer(cst.CSTTransformer):
    """Append the given statements to the end of the module. Run LAST after other transformers."""

    def __init__(self, global_statements: list[cst.SimpleStatementLine]) -> None:
        super().__init__()
        self.global_statements = global_statements

    def leave_Module(self, original_node: cst.Module, updated_node: cst.Module) -> cst.Module:
        if not self.global_statements:
            return updated_node

        # Separate every appended statement from the preceding code with an empty line.
        spaced_statements = [
            statement.with_changes(leading_lines=[cst.EmptyLine(), *statement.leading_lines])
            for statement in self.global_statements
        ]
        # Placing them at the very end keeps them after all functions, classes and assignments.
        return updated_node.with_changes(body=[*updated_node.body, *spaced_statements])
class GlobalStatementCollector(cst.CSTVisitor):
    """Collect simple statement lines outside classes/functions that are not purely imports or assignments."""

    def __init__(self) -> None:
        super().__init__()
        self.global_statements: list[cst.SimpleStatementLine] = []

    def visit_FunctionDef(self, node: cst.FunctionDef) -> bool:
        return False

    def visit_ClassDef(self, node: cst.ClassDef) -> bool:
        return False

    def visit_SimpleStatementLine(self, node: cst.SimpleStatementLine) -> None:
        skipped_types = (cst.Import, cst.ImportFrom, cst.Assign, cst.AnnAssign)
        # Keep the whole line as soon as one of its statements is something else.
        if any(not isinstance(part, skipped_types) for part in node.body):
            self.global_statements.append(node)
class DottedImportCollector(cst.CSTVisitor):
    """Collect imports as normalized dotted strings (e.g. 'from pathlib import Path' -> 'pathlib.Path').

    Imports are read from the module body and from the bodies of ``if`` and ``try``
    statements reached during traversal; class and function bodies are skipped.

    Resulting entries:
        * ``import a.b``           -> ``"a.b"``
        * ``import a.b as c``      -> ``"a.b.c"``
        * ``from a import b``      -> ``"a.b"``
        * ``from a import b as c`` -> ``"a.c"``
    """

    def __init__(self) -> None:
        super().__init__()
        self.imports: set[str] = set()

    def get_full_dotted_name(self, expr: cst.BaseExpression) -> str:
        """Render a ``Name``/``Attribute`` chain as ``a.b.c``; return "" for any other expression."""
        if isinstance(expr, cst.Name):
            return expr.value
        if isinstance(expr, cst.Attribute):
            return f"{self.get_full_dotted_name(expr.value)}.{expr.attr.value}"
        return ""

    def _collect_imports_from_block(self, block: cst.Module | cst.BaseSuite) -> None:
        """Record every import found directly in ``block.body`` (no recursion into nested blocks)."""
        for statement in block.body:
            if not isinstance(statement, cst.SimpleStatementLine):
                continue
            for child in statement.body:
                if isinstance(child, cst.Import):
                    for alias in child.names:
                        module = self.get_full_dotted_name(alias.name)
                        if alias.asname is None:
                            # Un-aliased import: the dotted module path is the entry itself.
                            # (Reading ``alias.name.value`` here would yield a CST node, not a
                            # string, for dotted names such as ``import a.b``.)
                            self.imports.add(module)
                            continue
                        asname = alias.asname.name.value
                        self.imports.add(module if module == asname else f"{module}.{asname}")
                elif isinstance(child, cst.ImportFrom):
                    # ``from . import x`` has no module to qualify the names with.
                    if child.module is None:
                        continue
                    # Star imports bind no statically known names.
                    if isinstance(child.names, cst.ImportStar):
                        continue
                    module = self.get_full_dotted_name(child.module)
                    for alias in child.names:
                        if isinstance(alias, cst.ImportAlias):
                            name = alias.name.value
                            asname = alias.asname.name.value if alias.asname else name
                            self.imports.add(f"{module}.{asname}")

    def visit_Module(self, node: cst.Module) -> None:
        self._collect_imports_from_block(node)

    def visit_FunctionDef(self, node: cst.FunctionDef) -> bool:
        return False

    def visit_ClassDef(self, node: cst.ClassDef) -> bool:
        return False

    def visit_If(self, node: cst.If) -> None:
        self._collect_imports_from_block(node.body)

    def visit_Try(self, node: cst.Try) -> None:
        # Only the ``try`` body is inspected here, not the handlers.
        self._collect_imports_from_block(node.body)
def extract_global_statements(source_code: str) -> tuple[cst.Module, list[cst.SimpleStatementLine]]:
    """Parse ``source_code`` and return the module together with its collected global statements."""
    parsed_module = cst.parse_module(source_code)
    statement_collector = GlobalStatementCollector()
    parsed_module.visit(statement_collector)
    return parsed_module, statement_collector.global_statements
class FutureAliasedImportTransformer(cst.CSTTransformer):
    """Strip aliased names (``x as y``) from ``from __future__ import ...`` statements."""

    def leave_ImportFrom(
        self, original_node: cst.ImportFrom, updated_node: cst.ImportFrom
    ) -> cst.BaseSmallStatement | cst.FlattenSentinel[cst.BaseSmallStatement] | cst.RemovalSentinel:
        import libcst.matchers as m

        module = updated_node.module
        # Anything that is not ``from __future__ import ...`` is returned unchanged.
        if not module or module.value != "__future__":
            return updated_node
        if not all(m.matches(alias, m.ImportAlias()) for alias in updated_node.names):
            return updated_node

        unaliased = [alias for alias in updated_node.names if alias.asname is None]
        if unaliased:
            return updated_node.with_changes(names=unaliased)
        # Every imported name was aliased: drop the whole statement.
        return cst.RemoveFromParent()
def delete___future___aliased_imports(module_code: str) -> str:
    """Return ``module_code`` after applying ``FutureAliasedImportTransformer`` to it."""
    parsed_module = cst.parse_module(module_code)
    cleaned_module = parsed_module.visit(FutureAliasedImportTransformer())
    return cleaned_module.code
def add_global_assignments(src_module_code: str, dst_module_code: str) -> str:
    """Copy new functions, global assignments and global statements from the source code into the destination.

    Returns the destination code unchanged when the source contributes nothing new.
    """
    src_module, src_statements = extract_global_statements(src_module_code)
    dst_module, dst_statements = extract_global_statements(dst_module_code)

    # Keep only the source statements that have no identical counterpart in the destination.
    statements_to_add = [
        candidate
        for candidate in src_statements
        if not any(candidate is known or candidate.deep_equals(known) for known in dst_statements)
    ]

    assignment_collector = GlobalAssignmentCollector()
    src_module.visit(assignment_collector)

    src_functions = GlobalFunctionCollector()
    src_module.visit(src_functions)
    dst_functions = GlobalFunctionCollector()
    dst_module.visit(dst_functions)

    # Only functions missing from the destination are added; existing ones are never overwritten here.
    functions_to_add = {
        func_name: func
        for func_name, func in src_functions.functions.items()
        if func_name not in dst_functions.functions
    }
    function_order = [func_name for func_name in src_functions.function_order if func_name in functions_to_add]

    if not (assignment_collector.assignments or functions_to_add or statements_to_add):
        return dst_module_code

    # Order matters: functions first, then assignments, then statements, so each can reference the previous.
    if functions_to_add:
        dst_module = dst_module.visit(GlobalFunctionTransformer(functions_to_add, function_order))
    if assignment_collector.assignments:
        assignment_transformer = GlobalAssignmentTransformer(
            assignment_collector.assignments, assignment_collector.assignment_order
        )
        dst_module = dst_module.visit(assignment_transformer)
    if statements_to_add:
        dst_module = dst_module.visit(GlobalStatementTransformer(statements_to_add))
    return dst_module.code
def resolve_star_import(module_name: str, project_root: Path) -> set[str]:
    """Resolve ``from module_name import *`` to the set of names it would bring in.

    The module is looked up under ``project_root`` as ``<module>.py`` or
    ``<module>/__init__.py`` and analysed statically (it is never imported).

    When the module assigns a list or tuple to ``__all__``, its string elements are
    returned. Otherwise the result is every top-level function, class, assigned
    name and imported name that does not start with an underscore.

    Args:
        module_name: Dotted module name, e.g. ``"pkg.mod"``.
        project_root: Directory the dotted name is resolved against.

    Returns:
        The resolved names; an empty set when the module file cannot be found or
        any error occurs (a warning is logged in both cases).
    """
    try:
        module_path = module_name.replace(".", "/")
        candidates = (project_root / f"{module_path}.py", project_root / f"{module_path}/__init__.py")
        module_file = next((path for path in candidates if path.exists()), None)
        if module_file is None:
            logger.warning(f"Could not find module file for {module_name}, skipping star import resolution")
            return set()

        tree = ast.parse(module_file.read_text(encoding="utf8"))

        # An explicit ``__all__`` takes precedence over the underscore convention.
        for node in ast.walk(tree):
            if (
                isinstance(node, ast.Assign)
                and len(node.targets) == 1
                and isinstance(node.targets[0], ast.Name)
                and node.targets[0].id == "__all__"
            ):
                if isinstance(node.value, (ast.List, ast.Tuple)):
                    return {
                        elt.value
                        for elt in node.value.elts
                        if isinstance(elt, ast.Constant) and isinstance(elt.value, str)
                    }
                # ``__all__`` is built dynamically: fall back to the public-name scan.
                break

        public_names: set[str] = set()
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                if not node.name.startswith("_"):
                    public_names.add(node.name)
            elif isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name) and not target.id.startswith("_"):
                        public_names.add(target.id)
            elif isinstance(node, ast.AnnAssign):
                if isinstance(node.target, ast.Name) and not node.target.id.startswith("_"):
                    public_names.add(node.target.id)
            elif isinstance(node, ast.Import):
                for alias in node.names:
                    # ``import a.b`` binds only the top-level name ``a`` in the module namespace.
                    name = alias.asname or alias.name.split(".")[0]
                    if not name.startswith("_"):
                        public_names.add(name)
            elif isinstance(node, ast.ImportFrom) and not any(alias.name == "*" for alias in node.names):
                for alias in node.names:
                    name = alias.asname or alias.name
                    if not name.startswith("_"):
                        public_names.add(name)
        return public_names
    except Exception as e:
        # Best effort: star-import resolution must never break the caller.
        logger.warning(f"Error resolving star import for {module_name}: {e}")
        return set()
def add_needed_imports_from_module(
    src_module_code: str | cst.Module,
    dst_module_code: str | cst.Module,
    src_path: Path,
    dst_path: Path,
    project_root: Path,
    helper_functions: list[FunctionSource] | None = None,
    helper_functions_fqn: set[str] | None = None,
) -> str:
    """Add all needed and used source module code imports to the destination module code, and return it.

    Imports are gathered from the source module (function and class bodies excluded), scheduled on a
    libcst ``CodemodContext`` through ``AddImportsVisitor`` / ``RemoveImportsVisitor``, and then both
    visitors are applied to the destination module.

    Args:
        src_module_code: Source module, as text or as an already parsed ``cst.Module``.
        dst_module_code: Destination module, as text or as an already parsed ``cst.Module``.
        src_path: Path of the source module; used to compute its module/package name.
        dst_path: Path of the destination module; used to compute its module/package name.
        project_root: Root used for module name calculation and star-import resolution.
        helper_functions: Helpers whose ``fully_qualified_name`` must not be imported. Only consulted
            when ``helper_functions_fqn`` is empty or ``None``.
        helper_functions_fqn: Fully qualified helper names that must not be imported.

    Returns:
        The destination code with imports added (leading newlines stripped), or the original
        destination code when there is nothing to add or any step fails.
    """
    if not helper_functions_fqn:
        helper_functions_fqn = {f.fully_qualified_name for f in (helper_functions or [])}

    # Compute the fallback once; every early exit and error path returns it.
    if isinstance(dst_module_code, str):
        dst_code_fallback = dst_module_code
        parsed_dst_module = None
    else:
        dst_code_fallback = dst_module_code.code
        parsed_dst_module = dst_module_code

    src_module_and_package: ModuleNameAndPackage = calculate_module_and_package(project_root, src_path)
    dst_module_and_package: ModuleNameAndPackage = calculate_module_and_package(project_root, dst_path)

    # NOTE(review): the destination context is built with the *source* file name — confirm this is intended.
    dst_context: CodemodContext = CodemodContext(
        filename=src_path.name,
        full_module_name=dst_module_and_package.name,
        full_package_name=dst_module_and_package.package,
    )

    try:
        if isinstance(src_module_code, cst.Module):
            src_module = src_module_code.visit(FutureAliasedImportTransformer())
        else:
            src_module = cst.parse_module(src_module_code).visit(FutureAliasedImportTransformer())

        # Early exit: check if source has any imports at module level.
        # The test must look at the small statements *inside* each statement line; testing the
        # line itself can never match and would make this function always return early.
        has_module_level_imports = any(
            isinstance(small_stmt, (cst.Import, cst.ImportFrom))
            for stmt in src_module.body
            if isinstance(stmt, cst.SimpleStatementLine)
            for small_stmt in stmt.body
        )
        if not has_module_level_imports:
            return dst_code_fallback

        gatherer: GatherImportsVisitor = GatherImportsVisitor(
            CodemodContext(
                filename=src_path.name,
                full_module_name=src_module_and_package.name,
                full_package_name=src_module_and_package.package,
            )
        )

        # Exclude function/class bodies so GatherImportsVisitor only sees module-level imports.
        # Nested imports (inside functions) are part of function logic and must not be
        # scheduled for add/remove — RemoveImportsVisitor would strip them as "unused".
        module_level_only = src_module.with_changes(
            body=[stmt for stmt in src_module.body if not isinstance(stmt, (cst.FunctionDef, cst.ClassDef))]
        )
        module_level_only.visit(gatherer)

        # Early exit: if no imports were gathered, return destination as-is
        if (
            not gatherer.module_imports
            and not gatherer.object_mapping
            and not gatherer.module_aliases
            and not gatherer.alias_mapping
        ):
            return dst_code_fallback
    except Exception as e:
        logger.error(f"Error parsing source module code: {e}")
        return dst_code_fallback

    dotted_import_collector = DottedImportCollector()
    if parsed_dst_module is None:
        try:
            parsed_dst_module = cst.parse_module(dst_module_code)
        except cst.ParserSyntaxError as e:
            logger.exception(f"Syntax error in destination module code: {e}")
            return dst_code_fallback
    parsed_dst_module.visit(dotted_import_collector)

    try:
        # Plain ``import module`` statements.
        for mod in gatherer.module_imports:
            # Skip __future__ imports as they cannot be imported directly
            # __future__ imports should only be imported with specific objects i.e from __future__ import annotations
            if mod == "__future__":
                continue
            if mod not in dotted_import_collector.imports:
                AddImportsVisitor.add_needed_import(dst_context, mod)
                RemoveImportsVisitor.remove_unused_import(dst_context, mod)

        # Objects imported under an alias are handled by the alias loop further down.
        aliased_objects = set()
        for mod, alias_pairs in gatherer.alias_mapping.items():
            for alias_pair in alias_pairs:
                if alias_pair[0] and alias_pair[1]:  # Both name and alias exist
                    aliased_objects.add(f"{mod}.{alias_pair[0]}")

        # ``from module import obj`` statements.
        for mod, obj_seq in gatherer.object_mapping.items():
            for obj in obj_seq:
                if (
                    f"{mod}.{obj}" in helper_functions_fqn or dst_context.full_module_name == mod  # avoid circular deps
                ):
                    continue  # Skip adding imports for helper functions already in the context

                if f"{mod}.{obj}" in aliased_objects:
                    continue

                # Handle star imports by resolving them to actual symbol names
                if obj == "*":
                    resolved_symbols = resolve_star_import(mod, project_root)
                    logger.debug(f"Resolved star import from {mod}: {resolved_symbols}")

                    for symbol in resolved_symbols:
                        if (
                            f"{mod}.{symbol}" not in helper_functions_fqn
                            and f"{mod}.{symbol}" not in dotted_import_collector.imports
                        ):
                            AddImportsVisitor.add_needed_import(dst_context, mod, symbol)
                            RemoveImportsVisitor.remove_unused_import(dst_context, mod, symbol)
                else:
                    if f"{mod}.{obj}" not in dotted_import_collector.imports:
                        AddImportsVisitor.add_needed_import(dst_context, mod, obj)
                    RemoveImportsVisitor.remove_unused_import(dst_context, mod, obj)
    except Exception as e:
        logger.exception(f"Error adding imports to destination module code: {e}")
        return dst_code_fallback

    # ``import module as alias`` statements.
    for mod, asname in gatherer.module_aliases.items():
        if not asname:
            continue
        if f"{mod}.{asname}" not in dotted_import_collector.imports:
            AddImportsVisitor.add_needed_import(dst_context, mod, asname=asname)
            RemoveImportsVisitor.remove_unused_import(dst_context, mod, asname=asname)

    # ``from module import obj as alias`` statements.
    for mod, alias_pairs in gatherer.alias_mapping.items():
        for alias_pair in alias_pairs:
            if f"{mod}.{alias_pair[0]}" in helper_functions_fqn:
                continue

            if not alias_pair[0] or not alias_pair[1]:
                continue

            if f"{mod}.{alias_pair[1]}" not in dotted_import_collector.imports:
                AddImportsVisitor.add_needed_import(dst_context, mod, alias_pair[0], asname=alias_pair[1])
                RemoveImportsVisitor.remove_unused_import(dst_context, mod, alias_pair[0], asname=alias_pair[1])

    try:
        # Apply the scheduled additions first, then drop whichever scheduled imports turn out unused.
        add_imports_visitor = AddImportsVisitor(dst_context)
        transformed_module = add_imports_visitor.transform_module(parsed_dst_module)
        transformed_module = RemoveImportsVisitor(dst_context).transform_module(transformed_module)
        return transformed_module.code.lstrip("\n")
    except Exception as e:
        logger.exception(f"Error adding imports to destination module code: {e}")
        return dst_code_fallback
def get_code(functions_to_optimize: list[FunctionToOptimize]) -> tuple[str | None, set[tuple[str, str]]]:
    """Return the code for a function or methods in a Python module.

    functions_to_optimize is either a singleton FunctionToOptimize instance, which represents either a function at the
    module level or a method of a class at the module level, or it represents a list of methods of the same class.

    For methods, the returned code starts with a class skeleton made of the class header, the class
    docstring (when present) and every dunder method other than the requested one, followed by the
    code of each requested method.

    Returns:
        A ``(code, contextual_dunder_methods)`` tuple, where ``contextual_dunder_methods`` holds
        ``(class_name, method_name)`` pairs for the dunder methods included in the skeleton.
        ``(None, set())`` is returned when the input is empty or unsupported (nested functions,
        methods of different classes), when the file has a syntax error, or when no target is found.
    """
    if (
        not functions_to_optimize
        or (functions_to_optimize[0].parents and functions_to_optimize[0].parents[0].type != "ClassDef")
        or (
            len(functions_to_optimize[0].parents) > 1
            or ((len(functions_to_optimize) > 1) and len({fn.parents[0] for fn in functions_to_optimize}) != 1)
        )
    ):
        return None, set()

    file_path: Path = functions_to_optimize[0].file_path
    # 1-indexed, inclusive (start, end) line ranges that make up the class skeleton.
    class_skeleton: set[tuple[int, int | None]] = set()
    contextual_dunder_methods: set[tuple[str, str]] = set()
    target_code: str = ""

    def find_target(node_list: list[ast.stmt], name_parts: tuple[str, str] | tuple[str]) -> ast.AST | None:
        """Locate ``name_parts`` in ``node_list``; for ``(class, method)`` also fill in the class skeleton."""
        target: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef | ast.Assign | ast.AnnAssign | None = None
        node: ast.stmt
        for node in node_list:
            if (
                # The many mypy issues will be fixed once this code moves to the backend,
                # using Type Guards as we move to 3.10+.
                # We will cover the Type Alias case on the backend since it's a 3.12 feature.
                isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)) and node.name == name_parts[0]
            ):
                target = node
                break
            # The next two cases cover type aliases in pre-3.12 syntax, where only single assignment is allowed.
            if (
                isinstance(node, ast.Assign)
                and len(node.targets) == 1
                and isinstance(node.targets[0], ast.Name)
                and node.targets[0].id == name_parts[0]
            ) or (isinstance(node, ast.AnnAssign) and hasattr(node.target, "id") and node.target.id == name_parts[0]):
                # A non-empty skeleton means a class body is being searched: an attribute is not a method.
                if class_skeleton:
                    break
                target = node
                break

        if target is None or len(name_parts) == 1:
            return target

        if not isinstance(target, ast.ClassDef) or len(name_parts) < 2:
            return None
        # At this point, name_parts has at least 2 elements
        method_name: str = name_parts[1]  # type: ignore[misc]

        # Class header: from the ``class`` line up to the line before the first body statement.
        class_skeleton.add((target.lineno, target.body[0].lineno - 1))
        cbody = target.body
        first_stmt = cbody[0]
        # A docstring is an expression *statement* (``ast.Expr``) wrapping a string constant.
        # Checking against ``ast.expr`` (the expression base class) never matches a statement.
        if (
            isinstance(first_stmt, ast.Expr)
            and isinstance(first_stmt.value, ast.Constant)
            and isinstance(first_stmt.value.value, str)
        ):
            class_skeleton.add((first_stmt.lineno, first_stmt.end_lineno))
            cbody = cbody[1:]

        cnode: ast.stmt
        for cnode in cbody:
            # Collect all dunder methods.
            if not isinstance(cnode, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            cnode_name: str = cnode.name
            if (
                len(cnode_name) > 4
                and cnode_name != method_name
                and cnode_name.isascii()
                and cnode_name.startswith("__")
                and cnode_name.endswith("__")
            ):
                contextual_dunder_methods.add((target.name, cnode_name))
                class_skeleton.add((cnode.lineno, cnode.end_lineno))

        return find_target(target.body, (method_name,))

    with file_path.open(encoding="utf8") as file:
        source_code: str = file.read()
    try:
        module_node: ast.Module = ast.parse(source_code)
    except SyntaxError:
        logger.exception("get_code - Syntax error while parsing code")
        return None, set()

    # Get the source code lines for the target node
    lines: list[str] = source_code.splitlines(keepends=True)
    if len(functions_to_optimize[0].parents) == 1:
        if (
            functions_to_optimize[0].parents[0].type == "ClassDef"
        ):  # All functions_to_optimize functions are methods of the same class.
            qualified_name_parts_list: list[tuple[str, str] | tuple[str]] = [
                (fto.parents[0].name, fto.function_name) for fto in functions_to_optimize
            ]
        else:
            logger.error(f"Error: get_code does not support inner functions: {functions_to_optimize[0].parents}")
            return None, set()
    elif len(functions_to_optimize[0].parents) == 0:
        qualified_name_parts_list = [(functions_to_optimize[0].function_name,)]
    else:
        logger.error(
            "Error: get_code does not support more than one level of nesting for now. "
            f"Parents: {functions_to_optimize[0].parents}"
        )
        return None, set()

    for qualified_name_parts in qualified_name_parts_list:
        target_node = find_target(module_node.body, qualified_name_parts)
        if target_node is None:
            continue
        # find_target returns FunctionDef, AsyncFunctionDef, ClassDef, Assign, or AnnAssign - all have lineno/end_lineno
        if not isinstance(
            target_node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Assign, ast.AnnAssign)
        ):
            continue

        if (
            isinstance(target_node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
            and target_node.decorator_list
        ):
            # Start at the first decorator so decorators are part of the extracted code.
            target_code += "".join(lines[target_node.decorator_list[0].lineno - 1 : target_node.end_lineno])
        else:
            target_code += "".join(lines[target_node.lineno - 1 : target_node.end_lineno])
    if not target_code:
        return None, set()

    # Emit the skeleton pieces in file order, then the targets.
    class_list: list[tuple[int, int | None]] = sorted(class_skeleton)
    class_code = "".join(["".join(lines[s_lineno - 1 : e_lineno]) for (s_lineno, e_lineno) in class_list])
    return class_code + target_code, contextual_dunder_methods
def find_preexisting_objects(source_code: str) -> set[tuple[str, tuple[FunctionParent, ...]]]:
    """Collect the functions, classes and class methods already defined in ``source_code``.

    Only module-level definitions and the functions found directly in a
    module-level class body are reported; deeper nesting is not inspected.

    Args:
        source_code: Python source text to inspect.

    Returns:
        A set of ``(name, parents)`` pairs. ``parents`` is ``()`` for module-level
        functions and classes, and a one-element tuple holding the owning class
        (as a ``FunctionParent`` of type ``"ClassDef"``) for methods. If the source
        cannot be parsed, the error is logged and an empty set is returned.
    """
    found: set[tuple[str, tuple[FunctionParent, ...]]] = set()
    try:
        parsed: ast.Module = ast.parse(source_code)
    except SyntaxError:
        logger.exception("find_preexisting_objects - Syntax error while parsing code")
        return found

    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    for stmt in parsed.body:
        if not isinstance(stmt, (*function_types, ast.ClassDef)):
            continue
        # Functions and classes at module level are both recorded without parents.
        found.add((stmt.name, ()))
        if isinstance(stmt, ast.ClassDef):
            # Methods are recorded with their owning class as the single parent.
            found.update(
                (member.name, (FunctionParent(stmt.name, "ClassDef"),))
                for member in stmt.body
                if isinstance(member, function_types)
            )
    return found
# True when the numba package can be located in the current environment.
has_numba: bool = find_spec("numba") is not None

# Top-level package names that are treated as numerical computing libraries.
NUMERICAL_MODULES = frozenset(
    (
        "numpy",
        "torch",
        "numba",
        "jax",
        "tensorflow",
        "math",
        "scipy",
    )
)

# Modules that require numba to be installed for optimization
NUMBA_REQUIRED_MODULES = frozenset(
    (
        "numpy",
        "math",
        "scipy",
    )
)
def _uses_numerical_names(node: ast.AST, numerical_names: set[str]) -> bool:
    """Report whether any ``ast.Name`` at or below ``node`` has an id in ``numerical_names``.

    Only bare names are considered; attribute names (the ``sum`` in ``np.sum``),
    argument declarations and definition names are not ``ast.Name`` nodes and
    therefore never match.
    """
    for descendant in ast.walk(node):
        if isinstance(descendant, ast.Name) and descendant.id in numerical_names:
            return True
    return False
def _collect_numerical_imports(tree: ast.Module) -> tuple[set[str], set[str]]:
    """Collect the names that imports of numerical libraries bind in ``tree``.

    Every node of the tree is visited, so imports nested inside functions or
    classes are included as well as module-level ones.

    Args:
        tree: Parsed module to scan.

    Returns:
        A ``(numerical_names, modules_used)`` pair:

        * ``numerical_names`` - local names introduced by the matching imports
          (the alias when ``as`` is used, otherwise the root package name for
          ``import x.y`` or the imported member name for ``from x import y``).
          For ``from x import *`` the root package name is recorded instead,
          since the individual names cannot be known statically.
        * ``modules_used`` - root package names from ``NUMERICAL_MODULES`` that
          were imported.
    """
    numerical_names: set[str] = set()
    modules_used: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                module_root = alias.name.split(".")[0]
                if module_root in NUMERICAL_MODULES:
                    numerical_names.add(alias.asname or module_root)
                    modules_used.add(module_root)
        elif isinstance(node, ast.ImportFrom) and node.module and not node.level:
            # A non-zero level marks a relative import (``from .math import x``),
            # which refers to a project-local module that merely shares a name
            # with a numerical library, so it must not be counted.
            module_root = node.module.split(".")[0]
            if module_root in NUMERICAL_MODULES:
                modules_used.add(module_root)
                for alias in node.names:
                    if alias.name == "*":
                        numerical_names.add(module_root)
                    else:
                        numerical_names.add(alias.asname or alias.name)
    return numerical_names, modules_used
def _find_function_node(tree: ast.Module, name_parts: list[str]) -> ast.FunctionDef | None:
    """Locate a function by its qualified name parts, e.g. ``["func"]`` or ``["ClassName", "method"]``.

    Args:
        tree: Parsed module to search.
        name_parts: One part for a module-level function, or two parts for a
            method of a module-level class.

    Returns:
        The first matching ``ast.FunctionDef``, or ``None`` when ``name_parts`` is
        empty or longer than two entries, when the class is not found, or when no
        plain (non-async) function with that name exists in the resolved body.
    """
    if not name_parts or len(name_parts) > 2:
        return None

    *class_path, func_name = name_parts
    scope: list[ast.stmt] = tree.body
    for class_name in class_path:
        # Descend into the first class with this name; give up if there is none.
        owner = next(
            (stmt for stmt in scope if isinstance(stmt, ast.ClassDef) and stmt.name == class_name),
            None,
        )
        if owner is None:
            return None
        scope = owner.body

    return next(
        (stmt for stmt in scope if isinstance(stmt, ast.FunctionDef) and stmt.name == func_name),
        None,
    )
def is_numerical_code(code_string: str, function_name: str | None = None) -> bool:
    """Decide whether code relies on numerical libraries (numpy, torch, numba, jax, tensorflow, scipy, math).

    Args:
        code_string: Python source to analyse.
        function_name: Optional target, either ``"func"`` or ``"ClassName.method"``.
            When omitted (or empty) the decision is based on the file's imports alone.

    Returns:
        ``False`` when the source does not parse, when no numerical module is
        imported, or - if ``function_name`` is given - when that function cannot be
        found or does not reference any imported numerical name. Otherwise the
        result is ``True`` unless numba is not installed and every numerical module
        imported by the file is one that needs numba (math/numpy/scipy).
    """
    try:
        tree = ast.parse(code_string)
    except SyntaxError:
        return False

    # Names bound by numerical imports, and the root modules they came from.
    numerical_names, modules_used = _collect_numerical_imports(tree)

    if function_name:
        # A dotted name addresses a method: "ClassName.method".
        target_function = _find_function_node(tree, function_name.split("."))
        if target_function is None or not _uses_numerical_names(target_function, numerical_names):
            return False
    elif not modules_used:
        return False

    # Without numba, code that only touches numba-dependent modules cannot be optimized.
    return has_numba or not modules_used.issubset(NUMBA_REQUIRED_MODULES)
def get_opt_review_metrics(
    source_code: str, file_path: Path, qualified_name: str, project_root: Path, tests_root: Path, language: Language
) -> str:
    """Build the markdown-formatted calling-function context used for optimization review.

    Args:
        source_code: Not read by this function; kept as part of the signature.
        file_path: File that defines the function being reviewed.
        qualified_name: ``"func"`` or ``"ClassName.method"``; everything before the
            last dot is treated as the owning class name.
        project_root: Passed through to the language support's reference search.
        tests_root: Passed through to the language support's reference search.
        language: Language used to select the language support implementation.

    Returns:
        The formatted references, or ``""`` when no language support is available,
        no references are found, or any exception is raised while gathering them
        (the exception is logged at debug level rather than propagated).
    """
    from codeflash.discovery.functions_to_optimize import FunctionToOptimize
    from codeflash.languages.registry import get_language_support
    from codeflash.models.models import FunctionParent

    start_time = time.perf_counter()
    try:
        support = get_language_support(language)
        if support is None:
            return ""

        # "Class.method" -> owner "Class", function "method"; a bare name has no owner.
        owner_name, _, function_name = qualified_name.rpartition(".")
        enclosing: list[FunctionParent] = (
            [FunctionParent(name=owner_name, type="ClassDef")] if owner_name else []
        )

        # Real line numbers are not available here, so placeholder values are supplied.
        target = FunctionToOptimize(
            function_name=function_name,
            file_path=file_path,
            parents=enclosing,
            starting_line=1,
            ending_line=1,
            language=str(language),
        )

        references = support.find_references(target, project_root, tests_root, max_files=500)
        if not references:
            return ""

        calling_fns_details = _format_references_as_markdown(references, file_path, project_root, language)
    except Exception as e:
        # Best effort: a failed lookup yields empty context instead of an error.
        logger.debug(f"Error getting function references: {e}")
        calling_fns_details = ""

    end_time = time.perf_counter()
    logger.debug(f"Got function references in {end_time - start_time:.2f} seconds")
    return calling_fns_details
def _format_references_as_markdown(references: list, file_path: Path, project_root: Path, language: Language) -> str:
"""Format references as markdown code blocks with calling function code."""
# Group references by file
refs_by_file: dict[Path, list] = {}
for ref in references:
# Exclude the source file's definition/import references
if ref.file_path == file_path and ref.reference_type in ("import", "reexport"):
continue
if ref.file_path not in refs_by_file:
refs_by_file[ref.file_path] = []
refs_by_file[ref.file_path].append(ref)