codesearch/load_codeforces.sql at main · serenedb/codesearch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
-- Codeforces corpus as PURE VIEWS over the Hugging Face parquet -- no COPY, no
-- rocksdb tables. All reshaping (rename, cast, filter, computed code_len, a
-- generated row_number() id) happens in the view; the index INCLUDEs every
-- column, so after the build the index is self-sufficient (it never re-reads
-- the parquet, even across restarts).
--
--   tasks      <- open-r1/codeforces             (statements, CC-BY-4.0)
--   solutions  <- open-r1/codeforces-submissions  (accepted code, CC-BY-4.0)
--
-- Scale/coverage controlled by the submission glob + an EVEN hash sample
-- (psql vars). A LIMIT would pile onto a few early problems; hash(submission_id)
-- % sample_mod = 0 samples ~1/sample_mod uniformly across ALL problems.
--   -v subs_glob='/abs/data/subs/*.parquet'   -v sample_mod=10

\timing on
SET http_retries = 10;
SET http_retry_wait_ms = 3000;

DROP INDEX IF EXISTS tasks_idx;
DROP INDEX IF EXISTS solutions_idx;
DROP VIEW IF EXISTS tasks_v;
DROP VIEW IF EXISTS solutions_v;
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS solutions;
DROP TEXT SEARCH DICTIONARY IF EXISTS cf_en;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams_q;

CREATE TEXT SEARCH DICTIONARY cf_en(
    template='text', locale='en_US.UTF-8', case='lower',
    stemming=true, accent=false, frequency=true, position=true, norm=true);
CREATE TEXT SEARCH DICTIONARY code_grams(
    template='sparse_ngram', frequency=true, norm=true);
CREATE TEXT SEARCH DICTIONARY code_grams_q(
    template='sparse_ngram', covering=true);

-- TASKS: reshape statements in the view (rename description->statement,
-- flatten tags list -> string, coalesce nulls). id is already "1/A".
CREATE VIEW tasks_v AS
SELECT id,
       title,
       contest_name,
       COALESCE(rating, 0)::INTEGER       AS rating,
       array_to_string(tags, ', ')        AS tags,
       description                        AS statement,
       COALESCE(editorial, '')            AS editorial
FROM read_parquet('hf://datasets/open-r1/codeforces@~parquet/default/partial-train/*.parquet')
WHERE description IS NOT NULL;

CREATE INDEX tasks_idx ON tasks_v
USING inverted(id, rating, title cf_en, statement cf_en, editorial cf_en, tags cf_en)
INCLUDE (id, contest_name, rating, title, statement, editorial, tags);

SELECT count(*) AS tasks FROM tasks_idx;

-- SOLUTIONS: reshape submissions in the view; row_number() gives a stable
-- unique id, length(source) gives code_len for shortest-first sorting.
CREATE VIEW solutions_v AS
SELECT row_number() OVER ()           AS id,
       problem_id                     AS task_id,
       source                         AS code,
       length(source)                 AS code_len,
       programmingLanguage            AS lang
FROM read_parquet(:'subs_glob')
WHERE verdict = 'OK' AND length(source) BETWEEN 20 AND 10000
  AND hash(submission_id) % :sample_mod = 0;

CREATE INDEX solutions_idx ON solutions_v
USING inverted(id, task_id, lang, code code_grams)
INCLUDE (id, task_id, code, code_len, lang);

SELECT count(*) AS solutions FROM solutions_idx;