-- codesearch/sql_tasks_mega.sql (serenedb/codesearch, main branch)

-- Session setup followed by teardown of everything this script creates, so
-- the script can be re-run from scratch.
-- The timing meta-command below is client-side (psql-style), not SQL.
\timing on
-- NOTE(review): presumably retry knobs for the remote hf:// parquet reads used
-- by the views in this script (12 retries, 5000 ms wait) -- confirm the exact
-- semantics against the engine's HTTP filesystem documentation.
SET http_retries = 12;
SET http_retry_wait_ms = 5000;
-- Teardown order: the index (built on tasks_v) goes first, then tasks_v
-- (which selects from every per-source view), then the per-source views.
DROP INDEX IF EXISTS tasks_idx;
DROP VIEW IF EXISTS tasks_v;
DROP VIEW IF EXISTS tasks_cf_v;
DROP VIEW IF EXISTS tasks_cc_v;
DROP VIEW IF EXISTS tasks_mbpp_v;
DROP VIEW IF EXISTS tasks_he_v;
DROP VIEW IF EXISTS tasks_rc_v;
DROP VIEW IF EXISTS tasks_ioi_v;

-- Codeforces source (open-r1/codeforces parquet export), projected onto the
-- common task shape: id, title, contest_name, rating, tags, statement,
-- editorial. Rows with no description are skipped.
CREATE VIEW tasks_cf_v AS
SELECT
    id,
    title,
    contest_name,
    CAST(COALESCE(rating, 0) AS INTEGER) AS rating,  -- missing rating -> 0
    array_to_string(tags, ', ') AS tags,             -- tag array -> 'a, b, c'
    description AS statement,
    COALESCE(editorial, '') AS editorial             -- missing editorial -> ''
FROM read_parquet('hf://datasets/open-r1/codeforces@~parquet/default/*/*.parquet')
WHERE description IS NOT NULL;

-- deepmind/code_contests source, projected onto the common task shape.
-- * id is synthetic: 'cc/' followed by a row number over the filtered rows.
--   NOTE(review): the window has no ORDER BY, so numbering follows scan
--   order -- confirm this is stable enough if ids must survive re-runs.
-- * The numeric source code is mapped to a judge name; unmapped codes become
--   'Other judge'.
-- * source 2 is excluded. NOTE(review): presumably those are the Codeforces
--   rows already supplied by tasks_cf_v -- confirm against the dataset card.
-- * Descriptions that are NULL or 50 characters or shorter are dropped.
CREATE VIEW tasks_cc_v AS
SELECT
    'cc/' || row_number() OVER () AS id,
    name AS title,
    CASE
        WHEN source = 1 THEN 'CodeChef'
        WHEN source = 3 THEN 'HackerEarth'
        WHEN source = 4 THEN 'CodeJam'
        WHEN source = 5 THEN 'AtCoder'
        WHEN source = 6 THEN 'AIZU'
        ELSE 'Other judge'
    END AS contest_name,
    CAST(COALESCE(cf_rating, 0) AS INTEGER) AS rating,
    array_to_string(cf_tags, ', ') AS tags,
    description AS statement,
    '' AS editorial
FROM read_parquet('hf://datasets/deepmind/code_contests@~parquet/default/*/*.parquet')
WHERE source <> 2
  AND description IS NOT NULL
  AND length(description) > 50;

-- MBPP: the 974 authored Python problems (config 'full', every split), read
-- directly from the Hugging Face parquet export -- nothing is staged locally.
-- rating is fixed at 0, tags at 'python', and editorial is empty for this source.
CREATE VIEW tasks_mbpp_v AS
SELECT
    'mbpp/' || CAST(task_id AS VARCHAR) AS id,
    'MBPP #' || CAST(task_id AS VARCHAR) AS title,
    'MBPP (Mostly Basic Python Problems)' AS contest_name,
    CAST(0 AS INTEGER) AS rating,
    'python' AS tags,
    trim(text) AS statement,
    '' AS editorial
FROM read_parquet('hf://datasets/google-research-datasets/mbpp@~parquet/full/*/*.parquet')
WHERE text IS NOT NULL;

-- HumanEval: 164 tasks. The id keeps the part of task_id after the slash
-- ('HumanEval/0' -> 'humaneval/0'). The title is entry_point, falling back to
-- 'HumanEval #<n>' when entry_point is NULL or empty.
CREATE VIEW tasks_he_v AS
SELECT
    'humaneval/' || split_part(task_id, '/', 2) AS id,
    COALESCE(
        NULLIF(entry_point, ''),
        'HumanEval #' || split_part(task_id, '/', 2)
    ) AS title,
    'HumanEval' AS contest_name,
    CAST(0 AS INTEGER) AS rating,
    'python' AS tags,
    trim(prompt) AS statement,
    '' AS editorial
FROM read_parquet('hf://datasets/openai/openai_humaneval@~parquet/openai_humaneval/test/*.parquet')
WHERE prompt IS NOT NULL;

-- Rosetta Code: the dataset has one row per (task, language). The Python row
-- is used as the canonical row for each task, selected with a plain filter
-- because QUALIFY and DISTINCT both break fast-path indexing.
-- The id is hash-based on task_name; the solutions branch derives the SAME id
-- independently, so the expression must stay in sync with it.
-- Descriptions that are NULL or 20 characters or shorter are dropped.
CREATE VIEW tasks_rc_v AS
SELECT
    'rc/' || abs(hash(task_name)) AS id,
    task_name AS title,
    'Rosetta Code' AS contest_name,
    CAST(0 AS INTEGER) AS rating,
    'rosetta-code' AS tags,
    task_description AS statement,
    '' AS editorial
FROM read_parquet('hf://datasets/christopher/rosetta-code@~parquet/default/train/*.parquet')
WHERE language_name = 'Python'
  AND task_description IS NOT NULL
  AND length(task_description) > 20;

-- IOI: the dataset has one row per subtask. '00-samples' appears once per
-- problem, so filtering on it yields a single row per problem.
-- The id is 'ioi/' followed by the dataset id and the year, concatenated
-- without a separator. Statements that are NULL or 50 characters or shorter
-- are dropped.
CREATE VIEW tasks_ioi_v AS
SELECT
    'ioi/' || id || year AS id,
    name AS title,
    'IOI ' || year AS contest_name,
    CAST(0 AS INTEGER) AS rating,
    'ioi, olympiad' AS tags,
    statement,
    '' AS editorial
FROM read_parquet('hf://datasets/open-r1/ioi@~parquet/default/train/*.parquet')
WHERE subtask = '00-samples'
  AND statement IS NOT NULL
  AND length(statement) > 50;

-- Unified task corpus: the six per-source views stacked with UNION ALL
-- (rows are concatenated; nothing is deduplicated).
-- Every branch names its columns explicitly instead of using SELECT *.
-- UNION ALL pairs columns by position, so an added or reordered column in one
-- per-source view would otherwise misalign the data silently; with explicit
-- lists such a change either has no effect or fails loudly.
CREATE VIEW tasks_v AS
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_cf_v
UNION ALL
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_cc_v
UNION ALL
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_mbpp_v
UNION ALL
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_he_v
UNION ALL
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_rc_v
UNION ALL
SELECT id, title, contest_name, rating, tags, statement, editorial
FROM tasks_ioi_v;

-- Build the search index over the combined tasks_v view.
-- NOTE(review): `inverted` and the `cf_en` annotations are engine-specific.
-- Presumably cf_en names the text analyzer applied to title, statement,
-- editorial and tags, while id and rating are indexed without one -- confirm
-- against the engine documentation.
-- NOTE(review): INCLUDE presumably lists the columns kept alongside the index
-- so they can be returned from it -- confirm.
CREATE INDEX tasks_idx ON tasks_v
USING inverted(id, rating, title cf_en, statement cf_en, editorial cf_en, tags cf_en)
INCLUDE (id, contest_name, rating, title, statement, editorial, tags);

-- Completion check: row count read back from the index, labelled tasks_done.
SELECT count(*) AS tasks_done FROM tasks_idx;