Skip to content

Commit 9da2f23

Browse files
Merge pull request #15 from JRaviLab/minor_regex_change
Minor regex change --> cleanData
2 parents 6bcddb6 + 394cfd7 commit 9da2f23

1 file changed

Lines changed: 5 additions & 5 deletions

File tree

R/data_processing.R

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -756,15 +756,15 @@ runPanaroo2Duckdb <- function(duckdb_path,
756756
# This finds the reference cluster ID and names the cluster with it
757757
ref_line <- grep("\\*$", cluster_lines, value = TRUE)
758758
ref_id <- if (length(ref_line) > 0) {
759-
stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+")
759+
stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+")
760760
} else {
761761
paste0("Cluster_", i - 1)
762762
}
763763

764764
# Pull genome IDs
765765
genome_matches <- stringr::str_match(
766766
cluster_lines,
767-
"fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+"
767+
"fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+"
768768
)[, 2]
769769
genome_matches <- genome_matches[!is.na(genome_matches)]
770770

@@ -874,8 +874,8 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map)
874874
names_faa <- names(cdhit_output_faa) |>
875875
tibble::as_tibble() |>
876876
dplyr::mutate(
877-
proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"),
878-
locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2],
877+
proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"),
878+
locus_tag = stringr::str_match(value, "peg(?:sc)?\\.[0-9]+\\|([^\\s]+)")[, 2],
879879
proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2])
880880
) |>
881881
dplyr::select(-value)
@@ -924,7 +924,7 @@ CDHIT2duckdb <- function(duckdb_path,
924924
clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa)
925925
DBI::dbWriteTable(con, "protein_cluster_seq",
926926
tibble::tibble(
927-
name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"),
927+
name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"),
928928
sequence = as.character(clustered_faa)
929929
),
930930
overwrite = TRUE

0 commit comments

Comments
 (0)