Skip to content

Commit b0c35f3

Browse files
committed
feat: add support for local parquet files in Duck application
1 parent e46fdd1 commit b0c35f3

File tree

3 files changed

+60
-7
lines changed

3 files changed

+60
-7
lines changed

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ duck_ccf_local_files: build
4444
@echo "warning! only works on Common Crawl Foundation's development machine"
4545
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files"
4646

47+
duck_local_files: build
48+
ifndef LOCAL_DIR
49+
$(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data)
50+
endif
51+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="local_files $(LOCAL_DIR)"
52+
4753
duck_cloudfront: build
4854
@echo "warning! this might take 1-10 minutes"
4955
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,14 @@ If you want to run many of these queries, and you have a lot of disk space, you'
796796
aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ .'
797797
```
798798
799+
or (if you don't have access through the AWS CLI):
800+
801+
```shell
802+
TBA - add the download instructions
803+
```
804+
805+
Then run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to execute the same query as above against your local copy of the index files.
806+
799807
> [!IMPORTANT]
800808
> If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files```
801809

src/main/java/org/commoncrawl/whirlwind/Duck.java

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.nio.charset.StandardCharsets;
2525
import java.nio.file.Files;
2626
import java.nio.file.Path;
27+
import java.nio.file.Paths;
2728
import java.sql.*;
2829
import java.time.format.DateTimeFormatter;
2930
import java.util.*;
@@ -38,7 +39,7 @@ public class Duck {
3839
private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");
3940

4041
public enum Algorithm {
41-
CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront");
42+
CCF_LOCAL_FILES("ccf_local_files"), CLOUDFRONT("cloudfront"), LOCAL_FILES("local_files");
4243

4344
private final String name;
4445

@@ -113,8 +114,13 @@ public static void printRowAsKvList(ResultSet rs, PrintStream out) throws SQLExc
113114
/**
114115
* Gets the list of parquet files to query based on the algorithm.
115116
*/
116-
public static List<String> getFiles(Algorithm algo, String crawl) throws IOException {
117+
public static List<String> getFiles(Algorithm algo, String crawl, String localPrefix) throws IOException {
117118
switch (algo) {
119+
case LOCAL_FILES: {
120+
Path indexPath = Path.of(localPrefix);
121+
return getLocalParquetFiles(indexPath);
122+
}
123+
118124
case CCF_LOCAL_FILES: {
119125
Path indexPath = Path.of("/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc", "crawl=" + crawl,
120126
"subset=warc");
@@ -142,6 +148,23 @@ public static List<String> getFiles(Algorithm algo, String crawl) throws IOExcep
142148
}
143149
}
144150

151+
private static List<String> getLocalParquetFiles(Path indexPath) throws IOException {
152+
if (!Files.isDirectory(indexPath)) {
153+
System.err.println("Directory not found: " + indexPath);
154+
System.exit(1);
155+
}
156+
157+
List<String> files = Files.list(indexPath).filter(p -> p.toString().endsWith(".parquet")).map(Path::toString)
158+
.collect(Collectors.toList());
159+
160+
if (files.isEmpty()) {
161+
System.err.println("No parquet files found in: " + indexPath);
162+
System.exit(1);
163+
}
164+
165+
return files;
166+
}
167+
145168
private static List<String> getLocalParquetFiles(Path indexPath, String prefix, String crawl) throws IOException {
146169
if (!Files.isDirectory(indexPath)) {
147170
printIndexDownloadAdvice(prefix, crawl);
@@ -189,6 +212,7 @@ private static ResultSet executeWithRetry(Statement stmt, String sql) throws SQL
189212
public static void main(String[] args) {
190213
String crawl = "CC-MAIN-2024-22";
191214
Algorithm algo = Algorithm.CLOUDFRONT;
215+
String localPrefix = "/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc";
192216

193217
if (args.length > 0) {
194218
if ("help".equalsIgnoreCase(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0])) {
@@ -200,20 +224,30 @@ public static void main(String[] args) {
200224
System.out.println("Using algorithm: " + algo.getName());
201225
}
202226

227+
if (algo == Algorithm.LOCAL_FILES) {
228+
if (args.length < 2) {
229+
System.err.println("Error: local_files algorithm requires a directory argument.");
230+
printUsage();
231+
System.exit(1);
232+
}
233+
localPrefix = args[1];
234+
}
235+
203236
try {
204-
run(algo, crawl);
237+
run(algo, crawl, localPrefix);
205238
} catch (Exception e) {
206239
System.err.println("Error: " + e.getMessage());
207240
printUsage();
208241
System.exit(1);
209242
}
210243
}
211244

212-
public static void run(Algorithm algo, String crawl) throws IOException, SQLException, InterruptedException {
245+
public static void run(Algorithm algo, String crawl, String localPrefix)
246+
throws IOException, SQLException, InterruptedException {
213247
// Ensure stdout uses UTF-8
214248
PrintStream out = new PrintStream(System.out, true, StandardCharsets.UTF_8);
215249

216-
List<String> files = getFiles(algo, crawl);
250+
List<String> files = getFiles(algo, crawl, localPrefix);
217251
String filesList = files.stream().map(f -> "'" + f + "'").collect(Collectors.joining(", "));
218252

219253
// Use in-memory DuckDB
@@ -304,14 +338,19 @@ private static void printResultSet(ResultSet rs, PrintStream out) throws SQLExce
304338
}
305339

306340
private static void printUsage() {
307-
System.err.println("Usage: Duck [algorithm]");
341+
System.err.println("Usage: Duck [algorithm] [local-directory]");
308342
System.err.println();
309343
System.err.println("Query Common Crawl index using DuckDB.");
310344
System.err.println();
311345
System.err.println("Algorithms:");
312-
System.err.println(" ccf_local_files Use local parquet files from /home/cc-pds/commoncrawl/...");
346+
System.err.println(" local_files Use local parquet files (from specified local directory)");
347+
System.err.println(
348+
" ccf_local_files Use local parquet files (default: /home/cc-pds/commoncrawl/cc-index/table/cc-main/warc)");
313349
System.err.println(" cloudfront Use CloudFront URLs (requires <crawl>.warc.paths.gz file)");
314350
System.err.println();
351+
System.err.println("Arguments:");
352+
System.err.println(" local-directory Local directory prefix for 'local_files' algorithm");
353+
System.err.println();
315354
System.err.println("Options:");
316355
System.err.println(" help, --help, -h Show this help message");
317356
}

0 commit comments

Comments
 (0)