2424import java .nio .charset .StandardCharsets ;
2525import java .nio .file .Files ;
2626import java .nio .file .Path ;
27+ import java .nio .file .Paths ;
2728import java .sql .*;
2829import java .time .format .DateTimeFormatter ;
2930import java .util .*;
@@ -38,7 +39,7 @@ public class Duck {
3839 private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter .ofPattern ("yyyyMMddHHmmss" );
3940
4041 public enum Algorithm {
41- CCF_LOCAL_FILES ("ccf_local_files" ), CLOUDFRONT ("cloudfront" );
42+ CCF_LOCAL_FILES ("ccf_local_files" ), CLOUDFRONT ("cloudfront" ), LOCAL_FILES ( "local_files" ) ;
4243
4344 private final String name ;
4445
@@ -113,8 +114,13 @@ public static void printRowAsKvList(ResultSet rs, PrintStream out) throws SQLExc
113114 /**
114115 * Gets the list of parquet files to query based on the algorithm.
115116 */
116- public static List <String > getFiles (Algorithm algo , String crawl ) throws IOException {
117+ public static List <String > getFiles (Algorithm algo , String crawl , String localPrefix ) throws IOException {
117118 switch (algo ) {
119+ case LOCAL_FILES : {
120+ Path indexPath = Path .of (localPrefix );
121+ return getLocalParquetFiles (indexPath );
122+ }
123+
118124 case CCF_LOCAL_FILES : {
119125 Path indexPath = Path .of ("/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc" , "crawl=" + crawl ,
120126 "subset=warc" );
@@ -142,6 +148,23 @@ public static List<String> getFiles(Algorithm algo, String crawl) throws IOExcep
142148 }
143149 }
144150
151+ private static List <String > getLocalParquetFiles (Path indexPath ) throws IOException {
152+ if (!Files .isDirectory (indexPath )) {
153+ System .err .println ("Directory not found: " + indexPath );
154+ System .exit (1 );
155+ }
156+
157+ List <String > files = Files .list (indexPath ).filter (p -> p .toString ().endsWith (".parquet" )).map (Path ::toString )
158+ .collect (Collectors .toList ());
159+
160+ if (files .isEmpty ()) {
161+ System .err .println ("No parquet files found in: " + indexPath );
162+ System .exit (1 );
163+ }
164+
165+ return files ;
166+ }
167+
145168 private static List <String > getLocalParquetFiles (Path indexPath , String prefix , String crawl ) throws IOException {
146169 if (!Files .isDirectory (indexPath )) {
147170 printIndexDownloadAdvice (prefix , crawl );
@@ -189,6 +212,7 @@ private static ResultSet executeWithRetry(Statement stmt, String sql) throws SQL
189212 public static void main (String [] args ) {
190213 String crawl = "CC-MAIN-2024-22" ;
191214 Algorithm algo = Algorithm .CLOUDFRONT ;
215+ String localPrefix = "/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc" ;
192216
193217 if (args .length > 0 ) {
194218 if ("help" .equalsIgnoreCase (args [0 ]) || "--help" .equals (args [0 ]) || "-h" .equals (args [0 ])) {
@@ -200,20 +224,30 @@ public static void main(String[] args) {
200224 System .out .println ("Using algorithm: " + algo .getName ());
201225 }
202226
227+ if (algo == Algorithm .LOCAL_FILES ) {
228+ if (args .length < 2 ) {
229+ System .err .println ("Error: local_files algorithm requires a directory argument." );
230+ printUsage ();
231+ System .exit (1 );
232+ }
233+ localPrefix = args [1 ];
234+ }
235+
203236 try {
204- run (algo , crawl );
237+ run (algo , crawl , localPrefix );
205238 } catch (Exception e ) {
206239 System .err .println ("Error: " + e .getMessage ());
207240 printUsage ();
208241 System .exit (1 );
209242 }
210243 }
211244
212- public static void run (Algorithm algo , String crawl ) throws IOException , SQLException , InterruptedException {
245+ public static void run (Algorithm algo , String crawl , String localPrefix )
246+ throws IOException , SQLException , InterruptedException {
213247 // Ensure stdout uses UTF-8
214248 PrintStream out = new PrintStream (System .out , true , StandardCharsets .UTF_8 );
215249
216- List <String > files = getFiles (algo , crawl );
250+ List <String > files = getFiles (algo , crawl , localPrefix );
217251 String filesList = files .stream ().map (f -> "'" + f + "'" ).collect (Collectors .joining (", " ));
218252
219253 // Use in-memory DuckDB
@@ -304,14 +338,19 @@ private static void printResultSet(ResultSet rs, PrintStream out) throws SQLExce
304338 }
305339
306340 private static void printUsage () {
307- System .err .println ("Usage: Duck [algorithm]" );
341+ System .err .println ("Usage: Duck [algorithm] [local-directory] " );
308342 System .err .println ();
309343 System .err .println ("Query Common Crawl index using DuckDB." );
310344 System .err .println ();
311345 System .err .println ("Algorithms:" );
312- System .err .println (" ccf_local_files Use local parquet files from /home/cc-pds/commoncrawl/..." );
346+ System .err .println (" local_files Use local parquet files (from specified local directory)" );
347+ System .err .println (
348+ " ccf_local_files Use local parquet files (default: /home/cc-pds/commoncrawl/cc-index/table/cc-main/warc)" );
313349 System .err .println (" cloudfront Use CloudFront URLs (requires <crawl>.warc.paths.gz file)" );
314350 System .err .println ();
351+ System .err .println ("Arguments:" );
352+ System .err .println (" local-directory Local directory prefix for 'local_files' algorithm" );
353+ System .err .println ();
315354 System .err .println ("Options:" );
316355 System .err .println (" help, --help, -h Show this help message" );
317356 }
0 commit comments