@@ -756,15 +756,15 @@ runPanaroo2Duckdb <- function(duckdb_path,
756756 # This finds the reference cluster ID and names the cluster with it
757757 ref_line <- grep(" \\ *$" , cluster_lines , value = TRUE )
758758 ref_id <- if (length(ref_line ) > 0 ) {
759- stringr :: str_extract(ref_line , " fig\\ |[0-9]+\\ .[0-9]+\\ .peg\\ .[0-9]+" )
759+ stringr :: str_extract(ref_line , " fig\\ |[0-9]+\\ .[0-9]+\\ .peg(?:sc)? \\ .[0-9]+" )
760760 } else {
761761 paste0(" Cluster_" , i - 1 )
762762 }
763763
764764 # Pull genome IDs
765765 genome_matches <- stringr :: str_match(
766766 cluster_lines ,
767- " fig\\ |([0-9]+\\ .[0-9]+)\\ .peg\\ .[0-9]+"
767+ " fig\\ |([0-9]+\\ .[0-9]+)\\ .peg(?:sc)? \\ .[0-9]+"
768768 )[, 2 ]
769769 genome_matches <- genome_matches [! is.na(genome_matches )]
770770
@@ -874,8 +874,8 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map)
874874 names_faa <- names(cdhit_output_faa ) | >
875875 tibble :: as_tibble() | >
876876 dplyr :: mutate(
877- proteinID = stringr :: str_extract(value , " ^fig\\ |[0-9]+\\ .[0-9]+\\ .peg\\ .[0-9]+" ),
878- locus_tag = stringr :: str_match(value , " peg\\ .[0-9]+\\ |([^\\ s]+)" )[, 2 ],
877+ proteinID = stringr :: str_extract(value , " ^fig\\ |[0-9]+\\ .[0-9]+\\ .peg(?:sc)? \\ .[0-9]+" ),
878+ locus_tag = stringr :: str_match(value , " peg(?:sc)? \\ .[0-9]+\\ |([^\\ s]+)" )[, 2 ],
879879 proteinName = stringr :: str_trim(stringr :: str_match(value , " \\ |[^\\ s]+\\ s+(.*?)\\ s+\\ [" )[, 2 ])
880880 ) | >
881881 dplyr :: select(- value )
@@ -924,7 +924,7 @@ CDHIT2duckdb <- function(duckdb_path,
924924 clustered_faa <- Biostrings :: readAAStringSet(cdhit_outputs $ clustered_faa )
925925 DBI :: dbWriteTable(con , " protein_cluster_seq" ,
926926 tibble :: tibble(
927- name = names(clustered_faa ) | > stringr :: str_extract(" fig\\ |[0-9]+\\ .[0-9]+\\ .peg\\ .[0-9]+" ),
927+ name = names(clustered_faa ) | > stringr :: str_extract(" fig\\ |[0-9]+\\ .[0-9]+\\ .peg(?:sc)? \\ .[0-9]+" ),
928928 sequence = as.character(clustered_faa )
929929 ),
930930 overwrite = TRUE
0 commit comments