-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_codeforces.sql
More file actions
72 lines (63 loc) · 3.02 KB
/
Copy pathload_codeforces.sql
File metadata and controls
72 lines (63 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
-- Codeforces corpus as PURE VIEWS over the Hugging Face parquet -- no COPY, no
-- rocksdb tables. All reshaping (rename, cast, filter, computed code_len, a
-- generated row_number() id) happens in the view; the index INCLUDEs every
-- column, so after the build the index is self-sufficient (it never re-reads
-- the parquet, even across restarts).
--
-- tasks <- open-r1/codeforces (statements, CC-BY-4.0)
-- solutions <- open-r1/codeforces-submissions (accepted code, CC-BY-4.0)
--
-- Scale/coverage controlled by the submission glob + an EVEN hash sample
-- (psql vars). A LIMIT would pile onto a few early problems; hash(submission_id)
-- % sample_mod = 0 samples ~1/sample_mod uniformly across ALL problems.
-- -v subs_glob='/abs/data/subs/*.parquet' -v sample_mod=10
\timing on
SET http_retries = 10;
SET http_retry_wait_ms = 3000;
DROP INDEX IF EXISTS tasks_idx;
DROP INDEX IF EXISTS solutions_idx;
DROP VIEW IF EXISTS tasks_v;
DROP VIEW IF EXISTS solutions_v;
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS solutions;
DROP TEXT SEARCH DICTIONARY IF EXISTS cf_en;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams_q;
CREATE TEXT SEARCH DICTIONARY cf_en(
template='text', locale='en_US.UTF-8', case='lower',
stemming=true, accent=false, frequency=true, position=true, norm=true);
CREATE TEXT SEARCH DICTIONARY code_grams(
template='sparse_ngram', frequency=true, norm=true);
CREATE TEXT SEARCH DICTIONARY code_grams_q(
template='sparse_ngram', covering=true);
-- TASKS: reshape statements in the view (rename description->statement,
-- flatten tags list -> string, coalesce nulls). id is already "1/A".
CREATE VIEW tasks_v AS
SELECT id,
title,
contest_name,
COALESCE(rating, 0)::INTEGER AS rating,
array_to_string(tags, ', ') AS tags,
description AS statement,
COALESCE(editorial, '') AS editorial
FROM read_parquet('hf://datasets/open-r1/codeforces@~parquet/default/partial-train/*.parquet')
WHERE description IS NOT NULL;
CREATE INDEX tasks_idx ON tasks_v
USING inverted(id, rating, title cf_en, statement cf_en, editorial cf_en, tags cf_en)
INCLUDE (id, contest_name, rating, title, statement, editorial, tags);
SELECT count(*) AS tasks FROM tasks_idx;
-- SOLUTIONS: reshape submissions in the view; row_number() gives a stable
-- unique id, length(source) gives code_len for shortest-first sorting.
CREATE VIEW solutions_v AS
SELECT row_number() OVER () AS id,
problem_id AS task_id,
source AS code,
length(source) AS code_len,
programmingLanguage AS lang
FROM read_parquet(:'subs_glob')
WHERE verdict = 'OK' AND length(source) BETWEEN 20 AND 10000
AND hash(submission_id) % :sample_mod = 0;
CREATE INDEX solutions_idx ON solutions_v
USING inverted(id, task_id, lang, code code_grams)
INCLUDE (id, task_id, code, code_len, lang);
SELECT count(*) AS solutions FROM solutions_idx;