Skip to content

Commit f04d3dc

Browse files
Add CD-HIT parsing function and update DB logic
Added a function to parse CD-HIT .clstr output into a long-format mapping of clusters to member feature ids. Updated database writing logic to include the new protein members table.
1 parent 5287d1b commit f04d3dc

1 file changed

Lines changed: 56 additions & 0 deletions

File tree

R/data_processing.R

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,54 @@ runPanaroo2Duckdb <- function(duckdb_path,
779779
cluster_map
780780
}
781781

782+
#' Parse CD-HIT `.clstr` output into a long-format mapping
783+
#'
784+
#' Reads a CD-HIT `.clstr` file and constructs a mapping of clusters to member feature ids.
785+
#'
786+
#' @param clustered_faa Base path to CD-HIT output (without `.clstr` extension).
787+
#'
788+
#' @return A tibble with columns `cluster` and `member`.
789+
#'
790+
#' @keywords internal
791+
.extractMembersInClusters <- function(clustered_faa) {
792+
clstr <- paste0(clustered_faa, ".clstr")
793+
if (!file.exists(clstr)) {
794+
stop("CD-HIT cluster file not found: ", clstr,
795+
"\nEnsure .runCDHIT() completed successfully and produced the .clstr file.")
796+
}
797+
798+
lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1
799+
cluster_ids <- grep("^>Cluster", lines)
800+
cluster_member <- data.table::data.table()
801+
802+
for (i in seq_along(cluster_ids)) {
803+
start <- cluster_ids[i] + 1
804+
end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines)
805+
cluster_lines <- lines[start:end]
806+
807+
# This finds the reference cluster ID and names the cluster with it
808+
ref_line <- grep("\\*$", cluster_lines, value = TRUE)
809+
ref_id <- if (length(ref_line) > 0) {
810+
stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+")
811+
} else {
812+
paste0("Cluster_", i - 1)
813+
}
814+
815+
# Pull genome IDs
816+
members <- stringr::str_match(cluster_lines,
817+
"fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+")[, 1]
818+
members <- members[!is.na(members)]
819+
820+
if (length(members) > 0) {
821+
cluster_member <- data.table::rbindlist(list(
822+
cluster_member,
823+
data.table::data.table(cluster = ref_id, member = members)
824+
), use.names = TRUE)
825+
}
826+
}
827+
828+
tibble::as_tibble(cluster_member)
829+
}
782830

783831
#' Build genome-by-protein-cluster count matrix
784832
#'
@@ -877,6 +925,10 @@ CDHIT2duckdb <- function(duckdb_path,
877925
),
878926
overwrite = TRUE
879927
)
928+
929+
cluster_member <- .extractMembersInClusters(cdhit_outputs$clustered_faa)
930+
DBI::dbWriteTable(con, "protein_members", cluster_member, overwrite = TRUE)
931+
880932
invisible(TRUE)
881933
}
882934

@@ -1433,6 +1485,7 @@ cleanData <- function(duckdb_path, path) {
14331485
protein_names_parquet <- file.path(path, "protein_names.parquet")
14341486

14351487
protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet")
1488+
protein_cluster_member_parquet <- file.path(path, "protein_members.parquet")
14361489

14371490
writeCompressedParquet <- function(df, path) {
14381491
arrow::write_parquet(
@@ -1502,6 +1555,9 @@ cleanData <- function(duckdb_path, path) {
15021555
DBI::dbReadTable(con, "protein_cluster_seq") |> writeCompressedParquet(protein_cluster_seq_parquet)
15031556
DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')", protein_cluster_seq_parquet))
15041557

1558+
DBI::dbReadTable(con, "protein_members") |> writeCompressedParquet(protein_cluster_member_parquet)
1559+
DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_members AS SELECT * FROM read_parquet('%s')", protein_cluster_member_parquet))
1560+
15051561
DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet)
15061562
DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", genome_gene_protein_parquet))
15071563

0 commit comments

Comments
 (0)