This repository was archived by the owner on Jun 5, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 90
85 lines (70 loc) · 3.01 KB
/
import_packages.yml
File metadata and controls
85 lines (70 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This workflow syncs the vector database.
#
# It is triggered manually (workflow_dispatch). The single job:
#   1. checks out the repository and installs it with pip,
#   2. pulls Git LFS objects,
#   3. assumes an AWS role and downloads the package JSONL files listed in
#      manifest.json from the codegate-data-prod S3 bucket,
#   4. installs Poetry and the project dependencies,
#   5. runs scripts/import_packages.py to build the SQLite vector DB,
#   6. uploads the resulting DB file as a build artifact.
name: Sync vector DB

on:
  workflow_dispatch:

jobs:
  sync_db:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Allows the job to request an OIDC token; the AWS credentials step
      # below uses role-to-assume rather than static access keys.
      id-token: write
    env:
      AWS_REGION: us-east-1

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4

      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install "."

      - name: Download git lfs dependencies
        run: |
          git lfs install
          git lfs pull

      - name: Configure AWS Credentials for S3
        uses: aws-actions/configure-aws-credentials@f24d7193d98baebaeacc7e2227925dd47cc267f5
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_INSIGHT_DATA_IMPORT }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Download JSONL files from S3
        run: |
          set -euo pipefail

          echo "Downloading manifest.json from S3..."
          aws s3 cp s3://codegate-data-prod/manifest.json ./manifest.json --region "$AWS_REGION"

          echo "Manifest content:"
          cat manifest.json

          echo "Parsing manifest..."
          # -e makes jq exit non-zero when a key is missing or null, so the
          # step fails here instead of later requesting "s3://<bucket>/null".
          MALICIOUS_KEY=$(jq -er '.latest.malicious_packages' manifest.json)
          DEPRECATED_KEY=$(jq -er '.latest.deprecated_packages' manifest.json)
          ARCHIVED_KEY=$(jq -er '.latest.archived_packages' manifest.json)
          VULNERABLE_KEY=$(jq -er '.latest.vulnerable_packages' manifest.json)

          echo "Malicious key: $MALICIOUS_KEY"
          echo "Deprecated key: $DEPRECATED_KEY"
          echo "Archived key: $ARCHIVED_KEY"
          echo "Vulnerable key: $VULNERABLE_KEY"

          mkdir -p /tmp/jsonl-files

          # Download and map the S3 files to fixed names in /tmp/jsonl-files
          aws s3 cp "s3://codegate-data-prod/$MALICIOUS_KEY" /tmp/jsonl-files/malicious.jsonl --region "$AWS_REGION"
          aws s3 cp "s3://codegate-data-prod/$DEPRECATED_KEY" /tmp/jsonl-files/deprecated.jsonl --region "$AWS_REGION"
          aws s3 cp "s3://codegate-data-prod/$ARCHIVED_KEY" /tmp/jsonl-files/archived.jsonl --region "$AWS_REGION"
          aws s3 cp "s3://codegate-data-prod/$VULNERABLE_KEY" /tmp/jsonl-files/vulnerable.jsonl --region "$AWS_REGION"

      - name: Install Poetry
        run: |
          # pipefail: without it a failed curl is hidden by python3 exiting 0.
          set -euo pipefail
          curl -sSL https://install.python-poetry.org | python3 -

      - name: Add Poetry to PATH
        run: |
          # The official installer places the poetry launcher in
          # $HOME/.local/bin on Linux. Appending to GITHUB_PATH prepends the
          # directory to PATH for all subsequent steps.
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Install dependencies with Poetry
        run: |
          poetry install

      - name: 'Run import_packages.py with poetry'
        run: |
          # Make sure the output directory exists before the DB is written
          # (no-op if it is already there).
          mkdir -p /tmp/sqlite_data
          poetry run python scripts/import_packages.py --jsonl-dir /tmp/jsonl-files --vec-db-path /tmp/sqlite_data/vectordb.db

      - name: 'Upload SQLite Vector DB File'
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: sqlite_data
          path: /tmp/sqlite_data/vectordb.db
          retention-days: 90
          # Fail the run if the DB was not produced instead of only warning.
          if-no-files-found: error