@@ -779,6 +779,54 @@ runPanaroo2Duckdb <- function(duckdb_path,
779779 cluster_map
780780}
781781
782+ # ' Parse CD-HIT `.clstr` output into a long-format mapping
783+ # '
784+ # ' Reads a CD-HIT `.clstr` file and constructs a mapping of clusters to member feature ids.
785+ # '
786+ # ' @param clustered_faa Base path to CD-HIT output (without `.clstr` extension).
787+ # '
788+ # ' @return A tibble with columns `cluster` and `member`.
789+ # '
790+ # ' @keywords internal
791+ .extractMembersInClusters <- function (clustered_faa ) {
792+ clstr <- paste0(clustered_faa , " .clstr" )
793+ if (! file.exists(clstr )) {
794+ stop(" CD-HIT cluster file not found: " , clstr ,
795+ " \n Ensure .runCDHIT() completed successfully and produced the .clstr file." )
796+ }
797+
798+ lines <- data.table :: fread(clstr , sep = " \n " , header = FALSE )$ V1
799+ cluster_ids <- grep(" ^>Cluster" , lines )
800+ cluster_member <- data.table :: data.table()
801+
802+ for (i in seq_along(cluster_ids )) {
803+ start <- cluster_ids [i ] + 1
804+ end <- if (i < length(cluster_ids )) cluster_ids [i + 1 ] - 1 else length(lines )
805+ cluster_lines <- lines [start : end ]
806+
807+ # This finds the reference cluster ID and names the cluster with it
808+ ref_line <- grep(" \\ *$" , cluster_lines , value = TRUE )
809+ ref_id <- if (length(ref_line ) > 0 ) {
810+ stringr :: str_extract(ref_line , " fig\\ |[0-9]+\\ .[0-9]+\\ .peg(?:sc)?\\ .[0-9]+" )
811+ } else {
812+ paste0(" Cluster_" , i - 1 )
813+ }
814+
815+ # Pull genome IDs
816+ members <- stringr :: str_match(cluster_lines ,
817+ " fig\\ |([0-9]+\\ .[0-9]+)\\ .peg(?:sc)?\\ .[0-9]+" )[, 1 ]
818+ members <- members [! is.na(members )]
819+
820+ if (length(members ) > 0 ) {
821+ cluster_member <- data.table :: rbindlist(list (
822+ cluster_member ,
823+ data.table :: data.table(cluster = ref_id , member = members )
824+ ), use.names = TRUE )
825+ }
826+ }
827+
828+ tibble :: as_tibble(cluster_member )
829+ }
782830
783831# ' Build genome-by-protein-cluster count matrix
784832# '
@@ -877,6 +925,10 @@ CDHIT2duckdb <- function(duckdb_path,
877925 ),
878926 overwrite = TRUE
879927 )
928+
929+ cluster_member <- .extractMembersInClusters(cdhit_outputs $ clustered_faa )
930+ DBI :: dbWriteTable(con , " protein_members" , cluster_member , overwrite = TRUE )
931+
880932 invisible (TRUE )
881933}
882934
@@ -1433,6 +1485,7 @@ cleanData <- function(duckdb_path, path) {
14331485 protein_names_parquet <- file.path(path , " protein_names.parquet" )
14341486
14351487 protein_cluster_seq_parquet <- file.path(path , " protein_seqs.parquet" )
1488+ protein_cluster_member_parquet <- file.path(path , " protein_members.parquet" )
14361489
14371490 writeCompressedParquet <- function (df , path ) {
14381491 arrow :: write_parquet(
@@ -1502,6 +1555,9 @@ cleanData <- function(duckdb_path, path) {
15021555 DBI :: dbReadTable(con , " protein_cluster_seq" ) | > writeCompressedParquet(protein_cluster_seq_parquet )
15031556 DBI :: dbExecute(con_new , sprintf(" CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')" , protein_cluster_seq_parquet ))
15041557
1558+ DBI :: dbReadTable(con , " protein_members" ) | > writeCompressedParquet(protein_cluster_member_parquet )
1559+ DBI :: dbExecute(con_new , sprintf(" CREATE OR REPLACE VIEW protein_members AS SELECT * FROM read_parquet('%s')" , protein_cluster_member_parquet ))
1560+
15051561 DBI :: dbReadTable(con , " genome_gene_protein" ) | > writeCompressedParquet(genome_gene_protein_parquet )
15061562 DBI :: dbExecute(con_new , sprintf(" CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')" , genome_gene_protein_parquet ))
15071563
0 commit comments