codegen-examples/examples/generate_training_data/run.py at 61df8432addf18746db919a574e1feddd9da1f76 · codegen-sh/codegen-examples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import json

import codegen
from codegen import Codebase
from codegen.sdk.enums import ProgrammingLanguage
from codegen.sdk.core.external_module import ExternalModule
from codegen.sdk.core.import_resolution import Import
from codegen.sdk.core.symbol import Symbol


def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
    """Finds the root symbol for an import"""
    if isinstance(imp.imported_symbol, Import):
        return hop_through_imports(imp.imported_symbol)
    return imp.imported_symbol


def get_function_context(function) -> dict:
    """Get the implementation, dependencies, and usages of a function."""
    context = {
        "implementation": {"source": function.source, "filepath": function.filepath},
        "dependencies": [],
        "usages": [],
    }

    # Add dependencies
    for dep in function.dependencies:
        # Hop through imports to find the root symbols source
        if isinstance(dep, Import):
            dep = hop_through_imports(dep)

        context["dependencies"].append({"source": dep.source, "filepath": dep.filepath})

    # Add usages
    for usage in function.usages:
        context["usages"].append(
            {
                "source": usage.usage_symbol.source,
                "filepath": usage.usage_symbol.filepath,
            }
        )

    return context


@codegen.function("generate-training-data")
def run(codebase: Codebase):
    """Generate training data using a node2vec-like approach for code embeddings.

    This codemod:
    1. Finds all functions in the codebase
    2. For each function:
       - Captures its implementation
       - Lists all dependencies (with their implementations)
       - Lists all usages (with their implementations)
    3. Outputs structured JSON data for training
    """
    # Track all function contexts
    training_data = {
        "functions": [],
        "metadata": {
            "total_functions": len(codebase.functions),
            "total_processed": 0,
            "avg_dependencies": 0,
            "avg_usages": 0,
        },
    }

    # Process each function in the codebase
    for function in codebase.functions:
        # Skip if function is too small
        if len(function.source.split("\n")) < 2:
            continue

        # Get function context
        context = get_function_context(function)

        # Only keep functions with enough context
        if len(context["dependencies"]) + len(context["usages"]) > 0:
            training_data["functions"].append(context)

    # Update metadata
    training_data["metadata"]["total_processed"] = len(training_data["functions"])
    if training_data["functions"]:
        training_data["metadata"]["avg_dependencies"] = sum(len(f["dependencies"]) for f in training_data["functions"]) / len(training_data["functions"])
        training_data["metadata"]["avg_usages"] = sum(len(f["usages"]) for f in training_data["functions"]) / len(training_data["functions"])

    # Print stats
    print(f"Processed {training_data['metadata']['total_processed']} functions")
    print(f"Average dependencies: {training_data['metadata']['avg_dependencies']:.2f}")
    print(f"Average usages: {training_data['metadata']['avg_usages']:.2f}")

    return training_data


if __name__ == "__main__":
    print("Initializing codebase...")
    codebase = Codebase.from_repo("fastapi/fastapi", commit="887270ff8a54bb58c406b0651678a27589793d2f", programming_language=ProgrammingLanguage.PYTHON)

    print("Generating training data...")
    training_data = run(codebase)

    print("Saving training data...")
    with open("training_data.json", "w") as f:
        json.dump(training_data, f, indent=2)
    print("Training data saved to training_data.json")