Source code for aldy.profile

# 786
# Aldy source: profile.py
#   This file is subject to the terms and conditions defined in
#   file 'LICENSE', which is part of this source code package.


from typing import Dict, List, Tuple, Optional
import pysam
import os
import os.path
import yaml

from collections import defaultdict
from natsort import natsorted
from .common import log, GRange, AldyException, script_path, chr_prefix
from .gene import Gene


[docs] class Profile: """Profile and model parameter information.""" def __init__(self, name, cn_region=None, data=None, **kwargs): self.name = name """Name of the profile.""" self.cn_region = cn_region """Location of the copy-number neutral region.""" self.data = data """Profile coverage data.""" self.gap = 0.0 """ Optimality gap. Non-zero values enable non-optimal solutions. Default: 0 (only optimal solutions) """ self.cn_solution = None """ User-specified copy-number configuration. Default: `None` (uses CN solver in :py:mod:`aldy.cn` for detection) """ self.neutral_value = 0.0 """ Joint coverage of the copy-number neutral region. Default: 0 (typically specified in the profile's YAML file) """ self.threshold = 0.5 """ Single-copy variant threshold. Its value indicate the fraction of total reads that contain the given variant in a single gene copy. For example, if two copies are given (maternal and paternal), and if the single copy coverage is 10, threshold of 0.5 will ensure that all variants with coverage less than 5 (i.e., 0.5 * 10) are filtered out. Default: 0.5 """ self.min_coverage = 2.0 """ Minimum coverage needed to call a variant. Default: 2 (5 for illumina/wgs) """ self.min_quality = 10 """ Minimum base quality for a read base to be considered. Default: 10 """ self.min_mapq = 10 """ Minimum mapping quality for a read to be considered. Default: 10 """ self.phase = True """ Set if the phasing model in :py:mod:`aldy.minor` is to be used. Default: `True` """ self.sam_long_reads = False """ Set if long reads should be split-mapped. Should be used when dealing with long PacBio or Nanopore reads Default: `False` (typically specified in the profile's YAML file) """ self.sam_mappy_preset = "map-hifi" """ Mappy preset to use for split-mapping. Default: map-hifi """ self.cn_max = 20 """ Maximum possible copy number of a gene. Default: 20 """ self.cn_pce_penalty = 2.0 """ Error penalty applied to the PCE region during CN calling (1 for no penalty). Default: 2.0 """ self.cn_diff = 10.0 """ The first CN objective term (coverage fit) coefficient. Default: 10.0 """ self.cn_fit = 1.0 """ The second CN objective term (gene fit) coefficient. Default: 1.0 """ self.cn_parsimony = 0.5 """ The third CN objective term (max. parsimony) coefficient. Default: 0.5 """ self.cn_fusion_left = 0.5 """ Extra penalty for the left fusions. Default: 0.5. """ self.cn_fusion_right = 0.25 """ Extra penalty for the right fusions. Default: 0.25. """ self.major_novel = 21.0 """ Penalty for novel functional mutation (0 for no penalty). Should be large enough to avoid calling novel mutations unless really necessary. Default: 21.0 (i.e., `max_cn + 1`) """ self.minor_miss = 1.5 """ Penalty for missed minor mutations (0 for no penalty). Ideally larger than `minor_add` as additions should be cheaper. Default: 1.5 """ self.minor_add = 1.0 """ Penalty for novel minor mutations (0 for no penalty). Zero penalty ensures that extra mutations are preferred over the coverage errors if the normalized variant slack coverage is >= 50%. Penalty of 1.0 prefers additions only if the variant slack coverage is >= 75%. Default: 1.0 """ self.minor_phase = 0.4 """ The minor star-allele calling model's phasing term coefficient. Default: 0.4 """ self.minor_phase_vars = 3_000 """ Number of variables to use during the phasing. Use lower number if the model takes too long to complete. Default: 3,000 """ self.male = False """ Set if the sample is male (i.e., has two X chromosomes). Used for calling sex chromosome genes (e.g., G6PD) when the CN calling is disabled. Default: False """ self.max_minor_solutions = 1 """ Maximum number of minor solutions to report for each major solution. Default: 1 """ self.display_format = False """ New novel allele display format. Default: False """ self.debug_probe = "" """ (Debug) Show raw data for a given mutation (e.g., I223M) """ self.debug_novel = False """ (Debug) Show potential novel functional mutations that are not in the database. """ self.min_avg_coverage = 2.0 """ Minimum average gene coverage needed for Aldy. Default: 2 """ self.vcf_sample_idx = 0 """ VCF sample index. Default: 0 """ self.indelpost = True """ Use indelpost for indel realignment (unless sam_long_reads is set). Default: True """ self.update(kwargs)
[docs] def update(self, kwargs): params = {} for n, v in kwargs.items(): if v is not None and n in self.__dict__: if n == "cn_solution": self.__dict__[n] = v else: try: if isinstance(self.__dict__[n], bool): self.__dict__[n] = not (v in ["False", "0"]) else: typ = type(self.__dict__[n]) self.__dict__[n] = typ(v) except (ValueError, TypeError): raise AldyException(f"Invalid parameter {n}: {v}") from None params[n] = self.__dict__[n] ps = "; ".join(f"{n}={v}" for n, v in params.items()) log.debug(f"[params] {ps}") return params
[docs] @staticmethod def load(gene, profile, cn_region=None, **params): """ Load the copy number profile and parameters from a profile file. :param gene: Gene instance. :param profile: A profile YAML or a SAM/BAM/CRAM file that contains profile data. :param cn_region: Copy-number neutral region. """ prof = None is_yml = True if os.path.exists(profile) and os.path.isfile(profile): ext = os.path.splitext(profile) if ext[-1] in [".bam", ".sam", ".cram"]: assert gene and cn_region regions = { (gene.name, r, gi): rng for gi, gr in enumerate(gene.regions) for r, rng in gr.items() } prof = Profile.get_sam_profile_data( profile, regions=regions, genome=gene.genome, cn_region=cn_region, ) is_yml = False else: with open(profile) as f: prof = yaml.safe_load(f) else: profile_path = script_path( "aldy.resources.profiles/{}.yml".format(profile.lower()) ) with open(profile_path) as f: prof = yaml.safe_load(f) if not prof: raise AldyException(f"Could not load profile from {profile}") if is_yml and profile != "illumina" and cn_region: raise AldyException("-n is only valid with illumina or BAM profile") if is_yml and profile == "illumina" and cn_region: prof["neutral"]["value"] = cn_region.end - cn_region.start if "neutral" not in prof or "value" not in prof["neutral"]: raise AldyException("Profile missing neutral region") if gene.name not in prof: raise AldyException(f"Profile missing {gene.name}") if gene.genome not in prof["neutral"]: raise AldyException(f"Profile {profile} not compatible with {gene.genome}") if cn_region is None: cn_region = GRange(*prof["neutral"][gene.genome]) return Profile( profile, cn_region, prof, neutral_value=prof["neutral"].get("value"), **dict(prof.get("options", {}), **params), )
[docs] @staticmethod def get_sam_profile_data( sam_path: str, ref_path: Optional[str] = None, regions: Dict[Tuple[str, str, int], GRange] = dict(), cn_region: Optional[GRange] = None, genome: Optional[str] = "hg19", params: Dict = dict(), ) -> Dict[str, Dict[str, List[float]]]: """ Load the profile information from a SAM/BAM/CRAM file. :param regions: List of regions to be extracted. :param cn_region: Copy-number neutral region. :return: list of tuples `(gene_name, chromosome, loci, coverage)`. .. note:: Profile samples used in the original Aldy paper: 1. PGRNseq-v1/v3: NA17642 2. PGRNseq-v2: NA19789 3. Illumina: by definition contains all ones (uniform coverage). """ if not genome: genome = "hg19" if len(regions) == 0: import importlib_resources gene_regions = {} for g in sorted( importlib_resources.files("aldy.resources.genes").iterdir() ): if g.suffix != ".yml": continue log.debug("Loading {}...", g) try: gg = Gene(script_path(f"aldy.resources.genes/{g}"), genome=genome) for gi, gr in enumerate(gg.regions): for r, rng in gr.items(): gene_regions[gg.name, r, gi] = rng except AldyException as e: log.warn(e) else: gene_regions = regions default_cn_neutral_region = { "hg19": GRange("22", 42547463, 42548249), "hg38": GRange("22", 42151472, 42152258), } gene_regions["neutral", "value", 0] = ( cn_region if cn_region else default_cn_neutral_region[genome] ) chr_regions: Dict[str, Tuple[int, int]] = {} for c, s, e in gene_regions.values(): if c not in chr_regions: chr_regions[c] = (s, e) else: chr_regions[c] = (min(s, chr_regions[c][0]), max(e, chr_regions[c][1])) cov: dict = defaultdict(lambda: defaultdict(int)) for c, (s, e) in natsorted(chr_regions.items()): if sam_path == "<illumina>": continue with pysam.AlignmentFile( # type: ignore sam_path, reference_filename=ref_path ) as sam: region = GRange(c, s, e).samtools( pad_left=1000, pad_right=1000, prefix=chr_prefix(c, [x["SN"] for x in sam.header["SQ"]]), ) log.info("Scanning {}...", region) try: for read in sam.fetch(region=region): start, s_start = read.reference_start, 0 if not read.cigartuples: continue for op, size in read.cigartuples: if op == 2: for i in range(size): cov[c][start + i] += 1 start += size elif op == 1: s_start += size elif op == 4: s_start += size elif op in [0, 7, 8]: for i in range(size): cov[c][start + i] += 1 start += size s_start += size except ValueError: log.warn("Cannot fetch {}", region) d: Dict = {} for (g, r, ri), (c, s, e) in gene_regions.items(): if g not in d: d[g] = {} if r not in d[g]: d[g][r] = [0] if ri >= len(d[g][r]): d[g][r].append(0) if sam_path == "<illumina>": d[g][r][ri] = e - s else: d[g][r][ri] = sum(cov[c][i] for i in range(s, e)) d["neutral"]["value"] = d["neutral"]["value"][0] d["neutral"][genome] = [*gene_regions["neutral", "value", 0]] if params: d["options"] = {} for k, v in Profile("").update(params).items(): d["options"][k] = v return d