gwas_norm.variants
sub-package#
gwas_norm.variants.vcf_info
#
VCF info constants and parsers
- gwas_norm.variants.vcf_info.ALLELE = VepKeys(vep_name='Allele', key_name='allele', parser=<function parse_return>)#
Name and parser for the VEP Allele field (VepKeys)
- gwas_norm.variants.vcf_info.CADD_INFO_FIELD = 'CADD'#
The name of the info field where the CADD data is stored (str)
- gwas_norm.variants.vcf_info.CADD_KEY_LOOKUP = {'CADD_PHRED': CaddKeys(cadd_name='CADD_PHRED', key_name='cadd_phred', parser=<function parse_float>), 'CADD_RAW': CaddKeys(cadd_name='CADD_RAW', key_name='cadd_raw', parser=<function parse_float>)}#
A lookup dictionary to match the CADD names that occur in the VCF header definition to their respective CaddKeys definitions and the parsers contained within them. (dict)
- gwas_norm.variants.vcf_info.CADD_MAIN_DELIMITER = '|'#
The delimiter that separates the CADD fields (str)
- gwas_norm.variants.vcf_info.CADD_PHRED = CaddKeys(cadd_name='CADD_PHRED', key_name='cadd_phred', parser=<function parse_float>)#
Name and parser for the CADD phred field (CaddKeys)
- gwas_norm.variants.vcf_info.CADD_RAW = CaddKeys(cadd_name='CADD_RAW', key_name='cadd_raw', parser=<function parse_float>)#
Name and parser for the CADD raw field (CaddKeys)
- gwas_norm.variants.vcf_info.CLINORIGIN_BI = ClinVarOri(int=64, name='biparental', display_name='bi-parental')#
ClinVar bi-parental origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_DEN = ClinVarOri(int=32, name='de-novo', display_name='de-novo')#
ClinVar de-novo origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_GERM = ClinVarOri(int=1, name='germline', display_name='Germline')#
ClinVar germline origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_INC = ClinVarOri(int=512, name='tested-inconclusive', display_name='tested-inconclusive')#
ClinVar tested inconclusive origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_INH = ClinVarOri(int=4, name='inherited', display_name='Inherited')#
ClinVar inherited origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_LOOKUP = {0: ClinVarOri(int=0, name='unknown', display_name='Unknown'), 1: ClinVarOri(int=1, name='germline', display_name='Germline'), 2: ClinVarOri(int=2, name='somatic', display_name='Somatic'), 4: ClinVarOri(int=4, name='inherited', display_name='Inherited'), 8: ClinVarOri(int=8, name='paternal', display_name='Paternal'), 16: ClinVarOri(int=16, name='maternal', display_name='Maternal'), 32: ClinVarOri(int=32, name='de-novo', display_name='de-novo'), 64: ClinVarOri(int=64, name='biparental', display_name='bi-parental'), 128: ClinVarOri(int=128, name='uniparental', display_name='uni-parental'), 256: ClinVarOri(int=256, name='not-tested', display_name='not-tested'), 512: ClinVarOri(int=512, name='tested-inconclusive', display_name='tested-inconclusive'), 1073741824: ClinVarOri(int=1073741824, name='other', display_name='Other')}#
All the ClinVar origin data fields as a lookup dictionary, with keys being the int field (int) and values being the ClinVarOri named tuples. (dict)
- gwas_norm.variants.vcf_info.CLINORIGIN_MAT = ClinVarOri(int=16, name='maternal', display_name='Maternal')#
ClinVar maternal origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_NT = ClinVarOri(int=256, name='not-tested', display_name='not-tested')#
ClinVar not tested origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_OTH = ClinVarOri(int=1073741824, name='other', display_name='Other')#
ClinVar other origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_PAT = ClinVarOri(int=8, name='paternal', display_name='Paternal')#
ClinVar paternal origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_SOM = ClinVarOri(int=2, name='somatic', display_name='Somatic')#
ClinVar somatic cell origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_UNI = ClinVarOri(int=128, name='uniparental', display_name='uni-parental')#
ClinVar uni-parental origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINORIGIN_UNKN = ClinVarOri(int=0, name='unknown', display_name='Unknown')#
ClinVar unknown origin data (ClinVarOri)
- gwas_norm.variants.vcf_info.CLINSIG_AFF = ClinVarSig(int=13, rank=16, name='affects', display_name='Affects')#
ClinVar affects data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_ASSC = ClinVarSig(int=10, rank=32, name='association', display_name='Association')#
ClinVar association data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_BEN = ClinVarSig(int=2, rank=1024, name='benign', display_name='Benign')#
ClinVar benign data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_CONF = ClinVarSig(int=12, rank=1, name='conflict', display_name='Conflict')#
ClinVar conflict data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_DRUG = ClinVarSig(int=6, rank=64, name='drug_response', display_name='Drug Response')#
ClinVar drug response data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_ERR = ClinVarSig(int=0, rank=2147483648, name='error', display_name='Error')#
ClinVar error data. This is an invention of gwas-norm to cover ClinVar parse failures (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_LBEN = ClinVarSig(int=3, rank=512, name='likely_benign', display_name='Likely Benign')#
ClinVar likely benign data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_LOOKUP = {0: ClinVarSig(int=0, rank=4096, name='uncertain_significance', display_name='Uncertain Significance'), 1: ClinVarSig(int=1, rank=2048, name='not_provided', display_name='Not Provided'), 2: ClinVarSig(int=2, rank=1024, name='benign', display_name='Benign'), 3: ClinVarSig(int=3, rank=512, name='likely_benign', display_name='Likely Benign'), 4: ClinVarSig(int=4, rank=256, name='likely_pathogenic', display_name='Likely Pathogenic'), 5: ClinVarSig(int=5, rank=2, name='pathogenic', display_name='Pathogenic'), 6: ClinVarSig(int=6, rank=64, name='drug_response', display_name='Drug Response'), 8: ClinVarSig(int=8, rank=128, name='confers_sensitivity', display_name='Confers Sensitivity'), 9: ClinVarSig(int=9, rank=8, name='risk_factor', display_name='Risk factor'), 10: ClinVarSig(int=10, rank=32, name='association', display_name='Association'), 11: ClinVarSig(int=11, rank=4, name='protective', display_name='Protective'), 12: ClinVarSig(int=12, rank=1, name='conflict', display_name='Conflict'), 13: ClinVarSig(int=13, rank=16, name='affects', display_name='Affects'), 255: ClinVarSig(int=255, rank=8192, name='other', display_name='Other')}#
The ClinVar clinical significance data arranged in a lookup dictionary, with keys being the int field and values being the ClinVarSig named tuples (dict)
- gwas_norm.variants.vcf_info.CLINSIG_LPATH = ClinVarSig(int=4, rank=256, name='likely_pathogenic', display_name='Likely Pathogenic')#
ClinVar likely pathogenic data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_NP = ClinVarSig(int=1, rank=2048, name='not_provided', display_name='Not Provided')#
ClinVar not provided data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_OTH = ClinVarSig(int=255, rank=8192, name='other', display_name='Other')#
ClinVar other data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_PATH = ClinVarSig(int=5, rank=2, name='pathogenic', display_name='Pathogenic')#
ClinVar pathogenic data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_PROT = ClinVarSig(int=11, rank=4, name='protective', display_name='Protective')#
ClinVar protective data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_RISK = ClinVarSig(int=9, rank=8, name='risk_factor', display_name='Risk factor')#
ClinVar risk factor data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_SEN = ClinVarSig(int=8, rank=128, name='confers_sensitivity', display_name='Confers Sensitivity')#
ClinVar confers sensitivity data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINSIG_USIG = ClinVarSig(int=0, rank=4096, name='uncertain_significance', display_name='Uncertain Significance')#
ClinVar uncertain significance data (ClinVarSig)
- gwas_norm.variants.vcf_info.CLINVAR_ID_DELIMITER = ':'#
The ID delimiter of strings in ClinVar fields. It is applied after processing either the CLINVAR_MAIN_DELIMITER or the CLINVAR_SUB_DELIMITER and separates a name from the ID for that name (str)
- gwas_norm.variants.vcf_info.CLINVAR_MAIN_DELIMITER = '|'#
The main delimiter of strings in ClinVar fields (str)
- gwas_norm.variants.vcf_info.CLINVAR_MAPPED_FIELDS = [ClinVarKeys(clinvar_name='CLNACC', key_name='clinvar_accession', parser=<function parse_clinvar_delim>), ClinVarKeys(clinvar_name='CLNSIG', key_name='clinvar_sig', parser=<function parse_clinvar_significance>), ClinVarKeys(clinvar_name='CLNDISDB', key_name='clinvar_dis_db', parser=<function parse_clinvar_disease_db>), ClinVarKeys(clinvar_name='CLNDN', key_name='clinvar_dis_name', parser=<function parse_clinvar_delim>), ClinVarKeys(clinvar_name='CLNREVSTAT', key_name='clinvar_review', parser=<function parse_clinvar_delim>)]#
These are paired delimited fields that should all contain the same number of CLINVAR_MAIN_DELIMITER characters, and each index of a split field should be associated together. (list of ClinVarKeys)
- gwas_norm.variants.vcf_info.CLINVAR_ORIGIN = [ClinVarOri(int=0, name='unknown', display_name='Unknown'), ClinVarOri(int=1, name='germline', display_name='Germline'), ClinVarOri(int=2, name='somatic', display_name='Somatic'), ClinVarOri(int=4, name='inherited', display_name='Inherited'), ClinVarOri(int=8, name='paternal', display_name='Paternal'), ClinVarOri(int=16, name='maternal', display_name='Maternal'), ClinVarOri(int=32, name='de-novo', display_name='de-novo'), ClinVarOri(int=64, name='biparental', display_name='bi-parental'), ClinVarOri(int=128, name='uniparental', display_name='uni-parental'), ClinVarOri(int=256, name='not-tested', display_name='not-tested'), ClinVarOri(int=512, name='tested-inconclusive', display_name='tested-inconclusive'), ClinVarOri(int=1073741824, name='other', display_name='Other')]#
All the ClinVar origin data fields arranged in their bitwise order (list of ClinVarOri)
- gwas_norm.variants.vcf_info.CLINVAR_SIGNIF = [ClinVarSig(int=12, rank=1, name='conflict', display_name='Conflict'), ClinVarSig(int=5, rank=2, name='pathogenic', display_name='Pathogenic'), ClinVarSig(int=11, rank=4, name='protective', display_name='Protective'), ClinVarSig(int=9, rank=8, name='risk_factor', display_name='Risk factor'), ClinVarSig(int=13, rank=16, name='affects', display_name='Affects'), ClinVarSig(int=10, rank=32, name='association', display_name='Association'), ClinVarSig(int=6, rank=64, name='drug_response', display_name='Drug Response'), ClinVarSig(int=8, rank=128, name='confers_sensitivity', display_name='Confers Sensitivity'), ClinVarSig(int=4, rank=256, name='likely_pathogenic', display_name='Likely Pathogenic'), ClinVarSig(int=3, rank=512, name='likely_benign', display_name='Likely Benign'), ClinVarSig(int=2, rank=1024, name='benign', display_name='Benign'), ClinVarSig(int=1, rank=2048, name='not_provided', display_name='Not Provided'), ClinVarSig(int=0, rank=4096, name='uncertain_significance', display_name='Uncertain Significance'), ClinVarSig(int=255, rank=8192, name='other', display_name='Other')]#
The ClinVar clinical significance data arranged in order from most significant/relevant to least (list of ClinVarSig)
- gwas_norm.variants.vcf_info.CLINVAR_SUB_DELIMITER = '/'#
The sub delimiter of strings in ClinVar fields, these are used after processing the CLINVAR_MAIN_DELIMITER (str)
- gwas_norm.variants.vcf_info.CLNACC = ClinVarKeys(clinvar_name='CLNACC', key_name='clinvar_accession', parser=<function parse_clinvar_delim>)#
The name and parser for the ClinVar accession (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNDISDB = ClinVarKeys(clinvar_name='CLNDISDB', key_name='clinvar_dis_db', parser=<function parse_clinvar_disease_db>)#
The name and parser for the ClinVar disease name database field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNDN = ClinVarKeys(clinvar_name='CLNDN', key_name='clinvar_dis_name', parser=<function parse_clinvar_delim>)#
The name and parser for the ClinVar disease name field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNHGVS = ClinVarKeys(clinvar_name='CLNHGVS', key_name='clinvar_hgvs', parser=<function parse_return>)#
The name and parser for the ClinVar HGVS format field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNORIGIN = ClinVarKeys(clinvar_name='CLNORIGIN', key_name='clinvar_origin', parser=<function parse_clinvar_origin>)#
The name and parser for the ClinVar origin field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNREVSTAT = ClinVarKeys(clinvar_name='CLNREVSTAT', key_name='clinvar_review', parser=<function parse_clinvar_delim>)#
The name and parser for the ClinVar clinical review status field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNSIG = ClinVarKeys(clinvar_name='CLNSIG', key_name='clinvar_sig', parser=<function parse_clinvar_significance>)#
The name and parser for the ClinVar clinical significance field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CLNVI = ClinVarKeys(clinvar_name='CLNVI', key_name='clinvar_var_id', parser=<function parse_clinvar_var_id>)#
The name and parser for the ClinVar variant identifier field (ClinVarKeys)
- gwas_norm.variants.vcf_info.CODING_SEQUENCE = So(name='coding_sequence_variant', rank=1048576, impact='MODIFIER', id='SO:0001580', description='A sequence variant that changes the coding sequence', display_name='Coding sequence variant')#
The sequence ontology data for the coding sequence variant term (So)
- gwas_norm.variants.vcf_info.CONSEQUENCE = VepKeys(vep_name='Consequence', key_name='consequence', parser=<function parse_vep_consequence>)#
Name and parser for the VEP Consequence field (VepKeys)
- gwas_norm.variants.vcf_info.CONSEQUENCES = [So(name='transcript_ablation', rank=1, impact='HIGH', id='SO:0001893', description='A feature ablation whereby the deleted region includes a transcript feature', display_name='Transcript ablation'), So(name='splice_acceptor_variant', rank=2, impact='HIGH', id='SO:0001574', description="A splice variant that changes the 2 base region at the 3' end of an intron", display_name='Splice acceptor variant'), So(name='splice_donor_variant', rank=4, impact='HIGH', id='SO:0001575', description="A splice variant that changes the 2 base region at the 5' end of an intron", display_name='Splice donor variant'), So(name='stop_gained', rank=8, impact='HIGH', id='SO:0001587', description='A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript', display_name='Stop gained'), So(name='frameshift_variant', rank=16, impact='HIGH', id='SO:0001589', description='A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three', display_name='Frameshift variant'), So(name='stop_lost', rank=32, impact='HIGH', id='SO:0001578', description='A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript', display_name='Stop lost'), So(name='start_lost', rank=64, impact='HIGH', id='SO:0002012', description='A codon variant that changes at least one base of the canonical start codon', display_name='Start lost'), So(name='transcript_amplification', rank=128, impact='HIGH', id='SO:0001889', description='A feature amplification of a region containing a transcript', display_name='Transcript amplification'), So(name='inframe_insertion', rank=256, impact='MODERATE', id='SO:0001821', description='An inframe non synonymous variant that inserts bases into in the coding sequence', display_name='Inframe insertion'), 
So(name='inframe_deletion', rank=512, impact='MODERATE', id='SO:0001822', description='An inframe non synonymous variant that deletes bases from the coding sequence', display_name='Inframe deletion'), So(name='missense_variant', rank=1024, impact='MODERATE', id='SO:0001583', description='A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved', display_name='Missense variant'), So(name='protein_altering_variant', rank=2048, impact='MODERATE', id='SO:0001818', description='A sequence_variant which is predicted to change the protein encoded in the coding sequence', display_name='Protein altering variant'), So(name='splice_region_variant', rank=4096, impact='LOW', id='SO:0001630', description='A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron', display_name='Splice region variant'), So(name='splice_donor_region_variant', rank=16384, impact='LOW', id='SO:0002170', description="A sequence variant that falls in the region between the 3rd and 6th base after splice junction (5' end of intron)", display_name='Splice donor region variant'), So(name='splice_donor_5th_base_variant', rank=8192, impact='LOW', id='SO:0001787', description='A sequence variant that causes a change at the 5th base pair after the start of the intron in the orientation of the transcript', display_name='Splice donor 5th base variant'), So(name='splice_polypyrimidine_tract_variant', rank=32768, impact='LOW', id='SO:0002169', description="A sequence variant that falls in the polypyrimidine tract at 3' end of intron between 17 and 3 bases from the end (acceptor -3 to acceptor -17)", display_name='Splice polypyrimidine tract variant'), So(name='incomplete_terminal_codon_variant', rank=65536, impact='LOW', id='SO:0001626', description='A sequence variant where at least one base of the final codon of an incompletely annotated 
transcript is changed', display_name='Incomplete terminal codon variant'), So(name='start_retained_variant', rank=131072, impact='LOW', id='SO:0002019', description='A sequence variant where at least one base in the start codon is changed, but the start remains', display_name='Start retained variant'), So(name='stop_retained_variant', rank=262144, impact='LOW', id='SO:0001567', description='A sequence variant where at least one base in the terminator codon is changed, but the terminator remains', display_name='Stop retained variant'), So(name='synonymous_variant', rank=131072, impact='LOW', id='SO:0001819', description='A sequence variant where there is no resulting change to the encoded amino acid', display_name='Synonymous variant'), So(name='coding_sequence_variant', rank=1048576, impact='MODIFIER', id='SO:0001580', description='A sequence variant that changes the coding sequence', display_name='Coding sequence variant'), So(name='mature_miRNA_variant', rank=2097152, impact='MODIFIER', id='SO:0001620', description='A transcript variant located with the sequence of the mature miRNA', display_name='Mature miRNA variant'), So(name='5_prime_UTR_variant', rank=4194304, impact='MODIFIER', id='SO:0001623', description="A UTR variant of the 5' UTR", display_name='5 prime UTR variant'), So(name='3_prime_UTR_variant', rank=8388608, impact='MODIFIER', id='SO:0001624', description="A UTR variant of the 3' UTR", display_name='3 prime UTR variant'), So(name='non_coding_transcript_exon_variant', rank=16777216, impact='MODIFIER', id='SO:0001792', description='A sequence variant that changes non-coding exon sequence in a non-coding transcript', display_name='Non coding transcript exon variant'), So(name='intron_variant', rank=33554432, impact='MODIFIER', id='SO:0001627', description='A transcript variant occurring within an intron', display_name='Intron variant'), So(name='NMD_transcript_variant', rank=67108864, impact='MODIFIER', id='SO:0001621', description='A variant in a 
transcript that is the target of NMD', display_name='NMD transcript variant'), So(name='non_coding_transcript_variant', rank=134217728, impact='MODIFIER', id='SO:0001619', description='A transcript variant of a non coding RNA gene', display_name='Non coding transcript variant'), So(name='upstream_gene_variant', rank=268435456, impact='MODIFIER', id='SO:0001631', description="A sequence variant located 5' of a gene", display_name='Upstream gene variant'), So(name='downstream_gene_variant', rank=536870912, impact='MODIFIER', id='SO:0001632', description="A sequence variant located 3' of a gene", display_name='Downstream gene variant'), So(name='TFBS_ablation', rank=1073741824, impact='MODIFIER', id='SO:0001895', description='A feature ablation whereby the deleted region includes a transcription factor binding site', display_name='TFBS ablation'), So(name='TFBS_amplification', rank=2147483648, impact='MODIFIER', id='SO:0001892', description='A feature amplification of a region containing a transcription factor binding site', display_name='TFBS amplification'), So(name='TF_binding_site_variant', rank=4294967296, impact='MODIFIER', id='SO:0001782', description='A sequence variant located within a transcription factor binding site', display_name='TF binding site variant'), So(name='regulatory_region_ablation', rank=8589934592, impact='MODERATE', id='SO:0001894', description='A feature ablation whereby the deleted region includes a regulatory region', display_name='Regulatory region ablation'), So(name='regulatory_region_amplification', rank=17179869184, impact='MODIFIER', id='SO:0001891', description='A feature amplification of a region containing a regulatory region', display_name='Regulatory region amplification'), So(name='feature_elongation', rank=34359738368, impact='MODIFIER', id='SO:0001907', description='A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence', display_name='Feature elongation'), 
So(name='regulatory_region_variant', rank=68719476736, impact='MODIFIER', id='SO:0001566', description='A sequence variant located within a regulatory region', display_name='Regulatory region variant'), So(name='feature_truncation', rank=137438953472, impact='MODIFIER', id='SO:0001906', description='A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence', display_name='Feature truncation'), So(name='intergenic_variant', rank=274877906944, impact='MODIFIER', id='SO:0001629', description='A sequence variant located in the intergenic region, between genes', display_name='Intergenic variant')]#
All the sequence ontology terms arranged from the most severe (lowest rank) to the most benign (highest rank) (list of So)
- gwas_norm.variants.vcf_info.CONSEQUENCE_LOOKUP = {'3_prime_UTR_variant': So(name='3_prime_UTR_variant', rank=8388608, impact='MODIFIER', id='SO:0001624', description="A UTR variant of the 3' UTR", display_name='3 prime UTR variant'), '5_prime_UTR_variant': So(name='5_prime_UTR_variant', rank=4194304, impact='MODIFIER', id='SO:0001623', description="A UTR variant of the 5' UTR", display_name='5 prime UTR variant'), 'NMD_transcript_variant': So(name='NMD_transcript_variant', rank=67108864, impact='MODIFIER', id='SO:0001621', description='A variant in a transcript that is the target of NMD', display_name='NMD transcript variant'), 'TFBS_ablation': So(name='TFBS_ablation', rank=1073741824, impact='MODIFIER', id='SO:0001895', description='A feature ablation whereby the deleted region includes a transcription factor binding site', display_name='TFBS ablation'), 'TFBS_amplification': So(name='TFBS_amplification', rank=2147483648, impact='MODIFIER', id='SO:0001892', description='A feature amplification of a region containing a transcription factor binding site', display_name='TFBS amplification'), 'TF_binding_site_variant': So(name='TF_binding_site_variant', rank=4294967296, impact='MODIFIER', id='SO:0001782', description='A sequence variant located within a transcription factor binding site', display_name='TF binding site variant'), 'coding_sequence_variant': So(name='coding_sequence_variant', rank=1048576, impact='MODIFIER', id='SO:0001580', description='A sequence variant that changes the coding sequence', display_name='Coding sequence variant'), 'downstream_gene_variant': So(name='downstream_gene_variant', rank=536870912, impact='MODIFIER', id='SO:0001632', description="A sequence variant located 3' of a gene", display_name='Downstream gene variant'), 'feature_elongation': So(name='feature_elongation', rank=34359738368, impact='MODIFIER', id='SO:0001907', description='A sequence variant that causes the extension of a genomic feature, with regard to the reference 
sequence', display_name='Feature elongation'), 'feature_truncation': So(name='feature_truncation', rank=137438953472, impact='MODIFIER', id='SO:0001906', description='A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence', display_name='Feature truncation'), 'frameshift_variant': So(name='frameshift_variant', rank=16, impact='HIGH', id='SO:0001589', description='A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three', display_name='Frameshift variant'), 'incomplete_terminal_codon_variant': So(name='incomplete_terminal_codon_variant', rank=65536, impact='LOW', id='SO:0001626', description='A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed', display_name='Incomplete terminal codon variant'), 'inframe_deletion': So(name='inframe_deletion', rank=512, impact='MODERATE', id='SO:0001822', description='An inframe non synonymous variant that deletes bases from the coding sequence', display_name='Inframe deletion'), 'inframe_insertion': So(name='inframe_insertion', rank=256, impact='MODERATE', id='SO:0001821', description='An inframe non synonymous variant that inserts bases into in the coding sequence', display_name='Inframe insertion'), 'intergenic_variant': So(name='intergenic_variant', rank=274877906944, impact='MODIFIER', id='SO:0001629', description='A sequence variant located in the intergenic region, between genes', display_name='Intergenic variant'), 'intron_variant': So(name='intron_variant', rank=33554432, impact='MODIFIER', id='SO:0001627', description='A transcript variant occurring within an intron', display_name='Intron variant'), 'mature_miRNA_variant': So(name='mature_miRNA_variant', rank=2097152, impact='MODIFIER', id='SO:0001620', description='A transcript variant located with the sequence of the mature miRNA', display_name='Mature miRNA 
variant'), 'missense_variant': So(name='missense_variant', rank=1024, impact='MODERATE', id='SO:0001583', description='A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved', display_name='Missense variant'), 'non_coding_transcript_exon_variant': So(name='non_coding_transcript_exon_variant', rank=16777216, impact='MODIFIER', id='SO:0001792', description='A sequence variant that changes non-coding exon sequence in a non-coding transcript', display_name='Non coding transcript exon variant'), 'non_coding_transcript_variant': So(name='non_coding_transcript_variant', rank=134217728, impact='MODIFIER', id='SO:0001619', description='A transcript variant of a non coding RNA gene', display_name='Non coding transcript variant'), 'protein_altering_variant': So(name='protein_altering_variant', rank=2048, impact='MODERATE', id='SO:0001818', description='A sequence_variant which is predicted to change the protein encoded in the coding sequence', display_name='Protein altering variant'), 'regulatory_region_ablation': So(name='regulatory_region_ablation', rank=8589934592, impact='MODERATE', id='SO:0001894', description='A feature ablation whereby the deleted region includes a regulatory region', display_name='Regulatory region ablation'), 'regulatory_region_amplification': So(name='regulatory_region_amplification', rank=17179869184, impact='MODIFIER', id='SO:0001891', description='A feature amplification of a region containing a regulatory region', display_name='Regulatory region amplification'), 'regulatory_region_variant': So(name='regulatory_region_variant', rank=68719476736, impact='MODIFIER', id='SO:0001566', description='A sequence variant located within a regulatory region', display_name='Regulatory region variant'), 'splice_acceptor_variant': So(name='splice_acceptor_variant', rank=2, impact='HIGH', id='SO:0001574', description="A splice variant that changes the 2 base region at the 3' end of an 
intron", display_name='Splice acceptor variant'), 'splice_donor_5th_base_variant': So(name='splice_donor_5th_base_variant', rank=8192, impact='LOW', id='SO:0001787', description='A sequence variant that causes a change at the 5th base pair after the start of the intron in the orientation of the transcript', display_name='Splice donor 5th base variant'), 'splice_donor_region_variant': So(name='splice_donor_region_variant', rank=16384, impact='LOW', id='SO:0002170', description="A sequence variant that falls in the region between the 3rd and 6th base after splice junction (5' end of intron)", display_name='Splice donor region variant'), 'splice_donor_variant': So(name='splice_donor_variant', rank=4, impact='HIGH', id='SO:0001575', description="A splice variant that changes the 2 base region at the 5' end of an intron", display_name='Splice donor variant'), 'splice_polypyrimidine_tract_variant': So(name='splice_polypyrimidine_tract_variant', rank=32768, impact='LOW', id='SO:0002169', description="A sequence variant that falls in the polypyrimidine tract at 3' end of intron between 17 and 3 bases from the end (acceptor -3 to acceptor -17)", display_name='Splice polypyrimidine tract variant'), 'splice_region_variant': So(name='splice_region_variant', rank=4096, impact='LOW', id='SO:0001630', description='A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron', display_name='Splice region variant'), 'start_lost': So(name='start_lost', rank=64, impact='HIGH', id='SO:0002012', description='A codon variant that changes at least one base of the canonical start codon', display_name='Start lost'), 'start_retained_variant': So(name='start_retained_variant', rank=131072, impact='LOW', id='SO:0002019', description='A sequence variant where at least one base in the start codon is changed, but the start remains', display_name='Start retained variant'), 'stop_gained': 
So(name='stop_gained', rank=8, impact='HIGH', id='SO:0001587', description='A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript', display_name='Stop gained'), 'stop_lost': So(name='stop_lost', rank=32, impact='HIGH', id='SO:0001578', description='A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript', display_name='Stop lost'), 'stop_retained_variant': So(name='stop_retained_variant', rank=262144, impact='LOW', id='SO:0001567', description='A sequence variant where at least one base in the terminator codon is changed, but the terminator remains', display_name='Stop retained variant'), 'synonymous_variant': So(name='synonymous_variant', rank=131072, impact='LOW', id='SO:0001819', description='A sequence variant where there is no resulting change to the encoded amino acid', display_name='Synonymous variant'), 'transcript_ablation': So(name='transcript_ablation', rank=1, impact='HIGH', id='SO:0001893', description='A feature ablation whereby the deleted region includes a transcript feature', display_name='Transcript ablation'), 'transcript_amplification': So(name='transcript_amplification', rank=128, impact='HIGH', id='SO:0001889', description='A feature amplification of a region containing a transcript', display_name='Transcript amplification'), 'upstream_gene_variant': So(name='upstream_gene_variant', rank=268435456, impact='MODIFIER', id='SO:0001631', description="A sequence variant located 5' of a gene", display_name='Upstream gene variant')}#
A lookup dictionary of all the sequence ontology terms with the sequence ontology term names as keys and the So namedtuples as values (dict)
- class gwas_norm.variants.vcf_info.CaddKeys(cadd_name, key_name, parser)#
Bases:
tuple
The CaddKeys data structure holds information on how to parse the nested CADD data structure, the layout (order) of which is defined by | separated
cadd_name
fields in the VCF header. The key_name
field is a lowercase version of the cadd_name
and the parser field
will hold a parsing function appropriate for the defined field (namedtuple)- cadd_name#
Alias for field number 0
- key_name#
Alias for field number 1
- parser#
Alias for field number 2
- class gwas_norm.variants.vcf_info.ClinVarKeys(clinvar_name, key_name, parser)#
Bases:
tuple
The ClinVarKeys data structure holds information on how to parse the nested ClinVar data structures. The
clinvar_name
is the name used in the INFO field. The key_name
field is a lowercase version of the clinvar_name
and the parser field
will hold a parsing function appropriate for the defined field (namedtuple)- clinvar_name#
Alias for field number 0
- key_name#
Alias for field number 1
- parser#
Alias for field number 2
- class gwas_norm.variants.vcf_info.ClinVarOri(int, name, display_name)#
Bases:
tuple
For storage of clinvar clinical origin data. The fields are as follows (namedtuple):
int
- The integer that is associated with the origin string in clinvar. The documentation in the ClinVar VCF file suggests that these can be combined by summing them. I am not entirely sure that is correct; rather, it looks like they can be combined using a bitwise |
.name
- The string name for the severity measure, lowercase and contains no spaces.display_name
- A display version of thename
may contain spaces and capital letters.
- display_name#
Alias for field number 2
- int#
Alias for field number 0
- name#
Alias for field number 1
- class gwas_norm.variants.vcf_info.ClinVarSig(int, rank, name, display_name)#
Bases:
tuple
For storage of clinvar clinical significance data. The fields are as follows (namedtuple):
int
- The integer that is associated with the significance string in clinvarrank
- This is not a ClinVar variable. It is an approximate severity or relevance order that can be used to order the clinical significance designations that are derived from multiple sources. The lower values are more severe/relevant. This can also be used to bitwise combine multiple clinical severity scores.name
- The string name for the severity measure, lowercase and contains no spaces.display_name
- A display version of thename
may contain spaces and capital letters.
- display_name#
Alias for field number 3
- int#
Alias for field number 0
- name#
Alias for field number 2
- rank#
Alias for field number 1
- gwas_norm.variants.vcf_info.DOWNSTREAM = So(name='downstream_gene_variant', rank=536870912, impact='MODIFIER', id='SO:0001632', description="A sequence variant located 3' of a gene", display_name='Downstream gene variant')#
The sequence ontology data for the downstream variant term (So)
- gwas_norm.variants.vcf_info.FEATURE = VepKeys(vep_name='Feature', key_name='feature', parser=<function parse_return>)#
Name and parser for the VEP feature field (VepKeys)
- gwas_norm.variants.vcf_info.FEATURE_ELONGATION = So(name='feature_elongation', rank=34359738368, impact='MODIFIER', id='SO:0001907', description='A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence', display_name='Feature elongation')#
The sequence ontology data for the feature elongation term (So)
- gwas_norm.variants.vcf_info.FEATURE_TYPE = VepKeys(vep_name='Feature_type', key_name='feature_type', parser=<function parse_return>)#
Name and parser for the VEP Feature type field (VepKeys)
- gwas_norm.variants.vcf_info.FEAT_TRUNCATION = So(name='feature_truncation', rank=137438953472, impact='MODIFIER', id='SO:0001906', description='A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence', display_name='Feature truncation')#
The sequence ontology data for the feature truncation variant term (So)
- gwas_norm.variants.vcf_info.FIVE_PRIME_UTR = So(name='5_prime_UTR_variant', rank=4194304, impact='MODIFIER', id='SO:0001623', description="A UTR variant of the 5' UTR", display_name='5 prime UTR variant')#
The sequence ontology data for the 5’ UTR variant term (So)
- gwas_norm.variants.vcf_info.FRAMESHIFT = So(name='frameshift_variant', rank=16, impact='HIGH', id='SO:0001589', description='A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three', display_name='Frameshift variant')#
The sequence ontology data for the frameshift term (So)
- gwas_norm.variants.vcf_info.GENE = VepKeys(vep_name='Gene', key_name='gene', parser=<function parse_return>)#
Name and parser for the VEP Gene field (VepKeys)
- gwas_norm.variants.vcf_info.INCOMPLETE_TERMINAL_CODON = So(name='incomplete_terminal_codon_variant', rank=65536, impact='LOW', id='SO:0001626', description='A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed', display_name='Incomplete terminal codon variant')#
The sequence ontology data for the incomplete terminal codon variant term (So)
- gwas_norm.variants.vcf_info.INFRAME_DEL = So(name='inframe_deletion', rank=512, impact='MODERATE', id='SO:0001822', description='An inframe non synonymous variant that deletes bases from the coding sequence', display_name='Inframe deletion')#
The sequence ontology data for the inframe deletion term (So)
- gwas_norm.variants.vcf_info.INFRAME_INS = So(name='inframe_insertion', rank=256, impact='MODERATE', id='SO:0001821', description='An inframe non synonymous variant that inserts bases into in the coding sequence', display_name='Inframe insertion')#
The sequence ontology data for the inframe insertion term (So)
- gwas_norm.variants.vcf_info.INTERGENIC = So(name='intergenic_variant', rank=274877906944, impact='MODIFIER', id='SO:0001629', description='A sequence variant located in the intergenic region, between genes', display_name='Intergenic variant')#
The sequence ontology data for the intergenic variant term (So)
- gwas_norm.variants.vcf_info.INTRON = So(name='intron_variant', rank=33554432, impact='MODIFIER', id='SO:0001627', description='A transcript variant occurring within an intron', display_name='Intron variant')#
The sequence ontology data for the intron variant term (So)
- gwas_norm.variants.vcf_info.MATURE_MIRNA = So(name='mature_miRNA_variant', rank=2097152, impact='MODIFIER', id='SO:0001620', description='A transcript variant located with the sequence of the mature miRNA', display_name='Mature miRNA variant')#
The sequence ontology data for the mature miRNA variant term (So)
- gwas_norm.variants.vcf_info.MISSENSE = So(name='missense_variant', rank=1024, impact='MODERATE', id='SO:0001583', description='A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved', display_name='Missense variant')#
The sequence ontology data for the missense term (So)
- gwas_norm.variants.vcf_info.NC_TRANS = So(name='non_coding_transcript_variant', rank=134217728, impact='MODIFIER', id='SO:0001619', description='A transcript variant of a non coding RNA gene', display_name='Non coding transcript variant')#
The sequence ontology data for the non-coding transcript variant term (So)
- gwas_norm.variants.vcf_info.NC_TRANS_EXON = So(name='non_coding_transcript_exon_variant', rank=16777216, impact='MODIFIER', id='SO:0001792', description='A sequence variant that changes non-coding exon sequence in a non-coding transcript', display_name='Non coding transcript exon variant')#
The sequence ontology data for the non-coding transcript exon variant term (So)
- gwas_norm.variants.vcf_info.NMD_TRANS = So(name='NMD_transcript_variant', rank=67108864, impact='MODIFIER', id='SO:0001621', description='A variant in a transcript that is the target of NMD', display_name='NMD transcript variant')#
The sequence ontology data for the nonsense mediated decay variant term (So)
- gwas_norm.variants.vcf_info.POLYPHEN = VepKeys(vep_name='PolyPhen', key_name='polyphen', parser=<function parse_float>)#
Name and parser for the VEP Polyphen field (VepKeys)
- gwas_norm.variants.vcf_info.PROT_ALTERING = So(name='protein_altering_variant', rank=2048, impact='MODERATE', id='SO:0001818', description='A sequence_variant which is predicted to change the protein encoded in the coding sequence', display_name='Protein altering variant')#
The sequence ontology data for the protein altering variant term (So)
- gwas_norm.variants.vcf_info.REG_REGION = So(name='regulatory_region_variant', rank=68719476736, impact='MODIFIER', id='SO:0001566', description='A sequence variant located within a regulatory region', display_name='Regulatory region variant')#
The sequence ontology data for the regulatory region variant term (So)
- gwas_norm.variants.vcf_info.REG_REGION_ABLATION = So(name='regulatory_region_ablation', rank=8589934592, impact='MODERATE', id='SO:0001894', description='A feature ablation whereby the deleted region includes a regulatory region', display_name='Regulatory region ablation')#
The sequence ontology data for the regulatory region ablation variant term (So)
- gwas_norm.variants.vcf_info.REG_REGION_AMP = So(name='regulatory_region_amplification', rank=17179869184, impact='MODIFIER', id='SO:0001891', description='A feature amplification of a region containing a regulatory region', display_name='Regulatory region amplification')#
The sequence ontology data for the regulatory region amplification variant term (So)
- gwas_norm.variants.vcf_info.SIFT = VepKeys(vep_name='SIFT', key_name='sift', parser=<function parse_float>)#
Name and parser for the VEP SIFT field (VepKeys)
- gwas_norm.variants.vcf_info.SPLICE_ACCEPTOR = So(name='splice_acceptor_variant', rank=2, impact='HIGH', id='SO:0001574', description="A splice variant that changes the 2 base region at the 3' end of an intron", display_name='Splice acceptor variant')#
The sequence ontology data for the splice acceptor term (So)
- gwas_norm.variants.vcf_info.SPLICE_DONOR = So(name='splice_donor_variant', rank=4, impact='HIGH', id='SO:0001575', description="A splice variant that changes the 2 base region at the 5' end of an intron", display_name='Splice donor variant')#
The sequence ontology data for the splice donor term (So)
- gwas_norm.variants.vcf_info.SPLICE_DONOR_FIFTH_BASE = So(name='splice_donor_5th_base_variant', rank=8192, impact='LOW', id='SO:0001787', description='A sequence variant that causes a change at the 5th base pair after the start of the intron in the orientation of the transcript', display_name='Splice donor 5th base variant')#
The sequence ontology data for the splice donor 5th base term (So)
- gwas_norm.variants.vcf_info.SPLICE_DONOR_REGION = So(name='splice_donor_region_variant', rank=16384, impact='LOW', id='SO:0002170', description="A sequence variant that falls in the region between the 3rd and 6th base after splice junction (5' end of intron)", display_name='Splice donor region variant')#
The sequence ontology data for the splice donor region term (So)
- gwas_norm.variants.vcf_info.SPLICE_POLYPRIM_TRACT = So(name='splice_polypyrimidine_tract_variant', rank=32768, impact='LOW', id='SO:0002169', description="A sequence variant that falls in the polypyrimidine tract at 3' end of intron between 17 and 3 bases from the end (acceptor -3 to acceptor -17)", display_name='Splice polypyrimidine tract variant')#
The sequence ontology for the splice polypyrimidine tract variant (So)
- gwas_norm.variants.vcf_info.SPLICE_REGION = So(name='splice_region_variant', rank=4096, impact='LOW', id='SO:0001630', description='A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron', display_name='Splice region variant')#
The sequence ontology data for the splice region term (So)
- gwas_norm.variants.vcf_info.START_LOST = So(name='start_lost', rank=64, impact='HIGH', id='SO:0002012', description='A codon variant that changes at least one base of the canonical start codon', display_name='Start lost')#
The sequence ontology data for the start lost term (So)
- gwas_norm.variants.vcf_info.START_RETAINED = So(name='start_retained_variant', rank=131072, impact='LOW', id='SO:0002019', description='A sequence variant where at least one base in the start codon is changed, but the start remains', display_name='Start retained variant')#
The sequence ontology data for the start retained variant term (So)
- gwas_norm.variants.vcf_info.STOP_GAINED = So(name='stop_gained', rank=8, impact='HIGH', id='SO:0001587', description='A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript', display_name='Stop gained')#
The sequence ontology data for the stop gained term (So)
- gwas_norm.variants.vcf_info.STOP_LOST = So(name='stop_lost', rank=32, impact='HIGH', id='SO:0001578', description='A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript', display_name='Stop lost')#
The sequence ontology data for the stop lost term (So)
- gwas_norm.variants.vcf_info.STOP_RETAINED = So(name='stop_retained_variant', rank=262144, impact='LOW', id='SO:0001567', description='A sequence variant where at least one base in the terminator codon is changed, but the terminator remains', display_name='Stop retained variant')#
The sequence ontology data for the stop retained variant term (So)
- gwas_norm.variants.vcf_info.SYNONYMOUS = So(name='synonymous_variant', rank=131072, impact='LOW', id='SO:0001819', description='A sequence variant where there is no resulting change to the encoded amino acid', display_name='Synonymous variant')#
The sequence ontology data for the synonymous variant term (So)
- class gwas_norm.variants.vcf_info.So(name, rank, impact, id, description, display_name)#
Bases:
tuple
A data structure to handle all the parameters associated with a sequence ontology term (namedtuple)
name
- The string name for the severity measure, lowercase and contains no spaces.rank
- This is not a sequence ontology variable. It is the approximate severity order that can be used to order the worst consequences of a variant in cases where the variant affects many transcripts. The lower values are more severe. This can also be used to bitwise combine multiple sequence ontology terms. (int)impact
- The qualitative general impact of a variant assigned one of these terms, UPPERCASE (str)id
- The sequence ontology ID for the term (starts withSO:
) (str)description
- A long form text description of the sequence ontology term. (str)display_name
- A display version of thename
may contain spaces and capital letters. (str)
- description#
Alias for field number 4
- display_name#
Alias for field number 5
- id#
Alias for field number 3
- impact#
Alias for field number 2
- name#
Alias for field number 0
- rank#
Alias for field number 1
- gwas_norm.variants.vcf_info.TFBS_ABLATION = So(name='TFBS_ablation', rank=1073741824, impact='MODIFIER', id='SO:0001895', description='A feature ablation whereby the deleted region includes a transcription factor binding site', display_name='TFBS ablation')#
The sequence ontology data for the transcription factor binding site ablation variant term (So)
- gwas_norm.variants.vcf_info.TFBS_AMP = So(name='TFBS_amplification', rank=2147483648, impact='MODIFIER', id='SO:0001892', description='A feature amplification of a region containing a transcription factor binding site', display_name='TFBS amplification')#
The sequence ontology data for the transcription factor binding site amplification term (So)
- gwas_norm.variants.vcf_info.TF_BINDING_SITE = So(name='TF_binding_site_variant', rank=4294967296, impact='MODIFIER', id='SO:0001782', description='A sequence variant located within a transcription factor binding site', display_name='TF binding site variant')#
The sequence ontology data for the transcription factor binding site variant term (So)
- gwas_norm.variants.vcf_info.THREE_PRIME_UTR = So(name='3_prime_UTR_variant', rank=8388608, impact='MODIFIER', id='SO:0001624', description="A UTR variant of the 3' UTR", display_name='3 prime UTR variant')#
The sequence ontology data for the 3’ UTR variant term (So)
- gwas_norm.variants.vcf_info.TRANS_ABLATION = So(name='transcript_ablation', rank=1, impact='HIGH', id='SO:0001893', description='A feature ablation whereby the deleted region includes a transcript feature', display_name='Transcript ablation')#
The sequence ontology data for the transcript ablation term (So)
- gwas_norm.variants.vcf_info.TRANS_AMP = So(name='transcript_amplification', rank=128, impact='HIGH', id='SO:0001889', description='A feature amplification of a region containing a transcript', display_name='Transcript amplification')#
The sequence ontology data for the transcript amplification term (So)
- gwas_norm.variants.vcf_info.UPSTREAM = So(name='upstream_gene_variant', rank=268435456, impact='MODIFIER', id='SO:0001631', description="A sequence variant located 5' of a gene", display_name='Upstream gene variant')#
The sequence ontology data for the upstream variant term (So)
- gwas_norm.variants.vcf_info.VCF_MISSING = '.'#
The symbol for missing data in a VCF file (str)
- gwas_norm.variants.vcf_info.VEP_INFO_FIELD = 'CSQ'#
The name of the info field where the VEP data is stored (str)
- gwas_norm.variants.vcf_info.VEP_KEY_LOOKUP = {'Allele': VepKeys(vep_name='Allele', key_name='allele', parser=<function parse_return>), 'Consequence': VepKeys(vep_name='Consequence', key_name='consequence', parser=<function parse_vep_consequence>), 'Feature': VepKeys(vep_name='Feature', key_name='feature', parser=<function parse_return>), 'Feature_type': VepKeys(vep_name='Feature_type', key_name='feature_type', parser=<function parse_return>), 'Gene': VepKeys(vep_name='Gene', key_name='gene', parser=<function parse_return>), 'PolyPhen': VepKeys(vep_name='PolyPhen', key_name='polyphen', parser=<function parse_float>), 'SIFT': VepKeys(vep_name='SIFT', key_name='sift', parser=<function parse_float>)}#
A lookup dictionary to match the VEP names that occur in the VCF header definition to their respective VepKeys definitions and parsers located within. (dict)
- gwas_norm.variants.vcf_info.VEP_MAIN_DELIMITER = '|'#
The delimiter that separates the vep fields (str)
- class gwas_norm.variants.vcf_info.VepKeys(vep_name, key_name, parser)#
Bases:
tuple
The VepKeys data structure holds information on how to parse the nested VEP data structure, the layout (order) of which is defined by | separated
vep_name
fields in the VCF header. The key_name
field is a lowercase version of the vep_name
and the parser field
will hold a parsing function appropriate for the defined field (namedtuple)- key_name#
Alias for field number 1
- parser#
Alias for field number 2
- vep_name#
Alias for field number 0
- gwas_norm.variants.vcf_info.cadd_info_parser(row)#
Parse the CADD information field in the VCF header.
- Parameters:
row (str) – The text to extract the CADD entry structure from.
- Returns:
cadd_keys – The keys in the required order to parse the CADD entries from within the INFO field of the VCF body.
- Return type:
list of CaddKeys
Notes
This is a specific format, placed in the gwas norm mapping file and is arranged in a similar way to the embedded VEP data.
See also
- gwas_norm.variants.vcf_info.clinvar_most_significant(parsed_clinvar)#
Extract the most significant ClinVar entry in terms of consequence.
- Parameters:
parsed_clinvar (list of dict) – A parsed clinvar INFO section
- Returns:
most_significant – A single clinvar entry containing the most significant consequence. The consequences are in a list of dict within the
dbs
field. This list will have the most significant clinvar hit at[0]
. The nested dict entries contain a clinvar_sig field that contains a ClinVarSig namedtuple.- Return type:
dict
See also
- gwas_norm.variants.vcf_info.parse_clinvar(info)#
Extract the clinvar annotations from an INFO object derived from the VCF body.
- Parameters:
info (pysam.VariantRecordInfo) – A record derived from a mapping file. This should contain all the clinvar INFO fields as they are defined in the dbSNP VCF file.
- Returns:
var_id – The variant identifier, if not available will be a
.
.- Return type:
str
Notes
Specifically, this looks for the following info fields:
CLNORIGIN
,CLNACC
,CLNSIG
,CLNDN
,CLNDISDB
,CLNREVSTAT
,CLNHGVS
,CLNVI
.This is designed to return
NoneType
if KeyErrors are encountered (i.e. any of the fields above are missing). However, other errors are re-raised with an output of all the clinvar fields so they can be debugged.
- gwas_norm.variants.vcf_info.parse_clinvar_dbs(info, idx)#
Parse the index matched strings with all the information contained in various databases.
- Parameters:
info (pysam.VariantRecordInfo) – A record derived from a mapping file. This should contain all the clinvar INFO fields as they are defined in the dbSNP VCF file.
idx (int) – The record number. The clinvar data for a single variant in the VCF file can sometimes have multiple entries, this is 0-based.
- Returns:
all_dbs – Each dictionary is a nested data structure with all the matched clinvar data within it.
NoneType
is returned if the CLNACC field is missing.
orNoneType
.- Return type:
NoneType or list of dict
Notes
The following fields are extracted and split to extract the index matched data:
CLNACC
,CLNSIG
,CLNDISDB
,CLNDN
,CLNREVSTAT
- gwas_norm.variants.vcf_info.parse_clinvar_delim(delim_str)#
Parse generic clinvar | delimited fields.
- Parameters:
delim_str (str or NoneType) – A clinvar main delimited string. If a string it can be
.
for missing otherwise could be a|
separated list of ClinVar entries.- Returns:
delim_list – A
NoneType
is returned if the field is missing (.
) orNoneType
.- Return type:
NoneType or list of str
- gwas_norm.variants.vcf_info.parse_clinvar_disease_db(delim_str)#
Parse the ClinVar disease database field (
CLNDISDB
).- Parameters:
delim_str (str or NoneType) – A clinvar main delimited string. If a string it can be
.
for missing otherwise could be a|
separated list of ClinVar disease database entries. These are additionally delimited by a : into db_name:db_id.- Returns:
dis_db – A
NoneType
is returned if the field is missing (.
) orNoneType
. Otherwise the tuples in the list represent (db_name, db_id).- Return type:
NoneType or list of tuple of (str, str)
- gwas_norm.variants.vcf_info.parse_clinvar_origin(origin)#
Parse a clinvar origin string. This should be in
CLNORIGIN
.- Parameters:
origin (str) – A clinvar origin string. This should be castable to an int and could represent a single origin integer or a bitwise combination of origins.
- Returns:
origins – A
NoneType
is returned if theorigin
string is not castable to an integer.- Return type:
NoneType or list of ClinVarOri
- gwas_norm.variants.vcf_info.parse_clinvar_significance(delim_str)#
Parse clinvar clinical significance field (
CLNSIG
).- Parameters:
delim_str (str or NoneType) – A clinvar main delimited string. If a string it can be
.
for missing otherwise could be a|
separated list of ClinVar clinical significance terms.- Returns:
clinsig – A
NoneType
is returned if the field is missing (.
) orNoneType
. Otherwise, ClinVarSig named tuples representing all the clinical significance definitions associated with the entry is returned.- Return type:
NoneType or list of ClinVarSig
- gwas_norm.variants.vcf_info.parse_clinvar_var_id(delim_str)#
Parse clinvar variant identifiers field (
CLNVI
).- Parameters:
delim_str (str or NoneType) – A clinvar main delimited string. If a string it can be
.
for missing otherwise could be a|
separated list of ClinVar variant identifiers. These are additionally delimited by a : into var_name:var_id.- Returns:
delim_str – A
NoneType
is returned if the field is missing (.
) orNoneType
.- Return type:
NoneType or list of str
- gwas_norm.variants.vcf_info.parse_float(field, *args, **kwargs)#
Parse a float represented as a string into a proper float.
- Parameters:
field (str) – The field to cast into a float
*args – Any positional arguments (ignored)
**kwargs – Any keyword arguments (ignored)
- Returns:
field – The string cast into a float.
- Return type:
float
- gwas_norm.variants.vcf_info.parse_none(*args, **kwargs)#
A dummy function that performs no parsing.
- Parameters:
*args – Any positional arguments (ignored)
**kwargs – Any keyword arguments (ignored)
- Returns:
none – Simply return a
NoneType
when called.- Return type:
NoneType
Notes
This is designed as a placeholder function when using functional parsers, so it uses a generic interface that is designed not to fail but just to return a
NoneType
in all circumstances.
- gwas_norm.variants.vcf_info.parse_return(field, *args, **kwargs)#
A dummy function that returns whatever it has been passed.
- Parameters:
field (any) – The field to return
*args – Any positional arguments (ignored)
**kwargs – Any keyword arguments (ignored)
- Returns:
field – Simply return whatever has been passed.
- Return type:
str
Notes
This is designed as a placeholder function when using functional parsers. It acts as a pass-through and uses a generic interface that is designed not to fail but just to return the
field
argument in all circumstances.
- gwas_norm.variants.vcf_info.parse_vep_consequence(field)#
Parse a VEP consequence field that has been extracted from a VEP string.
- Parameters:
field (str) – A field containing VEP consequences.
- Returns:
consequences – The mapped consequences.
- Return type:
list of So
Notes
The consequences string may be delimited with & if the variant overlaps with multiple transcripts.
- gwas_norm.variants.vcf_info.validate_header_metadata(metadata, name, number, dtype)#
Make sure the data in a VCF header field is the expected format
- Parameters:
metadata (pysam.VariantHeaderMetadata) – A variant header metadata object, so this could be the result of a call to pysam.VariantFile.header.formats or pysam.VariantFile.header.info.
name (str) – The name key to lookup in the metadata
number (int or str) – The expected value for the number field under the name, i.e.
1
orA
.dtype (str) – The expected value for the
type
attribute under the metadata name,
- Raises:
- gwas_norm.variants.vcf_info.vep_info_parser(row)#
Parse the VEP information order from a VCF info field.
- Parameters:
row (str) – The text in the VEP VCF info field
- Returns:
vep_keys – The keys (in the required order) to be applied to the VEP INFO fields in the VCF.
- Return type:
list of VepKeys
Notes
The VEP INFO field in the header of a VCF file is frequently called
CSQ
(consequences), this function should be given the text in the description field of the consequences entry. This contains a pipe separated list of the various VEP information in the order it will be given in the INFO fields of the VCF body. The VepKeys will contain descriptions and parsers for their respective data.
- gwas_norm.variants.vcf_info.vep_worst_consequence(parsed_vep)#
Extract the VEP entry with the worst predicted consequence.
- Parameters:
parsed_vep (list of dict) – A parsed VEP INFO section.
- Returns:
worst_consequence – A single VEP entry containing the worst predicted consequence. The variant has multiple VEP entries for each transcript involved and potentially each alternate allele of the variant. Within the nested dict entries the consequences are in a list under the
consequence
key, with the worst consequence being represented by a So namedtuple at[0]
in this list.- Return type:
dict
Notes
The list containing the So named tuples might not need to be a list, I have only ever found a single So named tuple in each one, so this may change in future.
gwas_norm.variants.norm
#
Tools for normalising to the reference genome sequence and for normalising INDELs as per
- class gwas_norm.variants.norm.EnsemblRefNorm(rest_client, *args, **kwargs)#
Bases:
RefNorm
Handles interactions with the reference genome sequence for reference allele lookups and INDEL normalisations.
- Parameters:
rest_client (ensembl_rest_client.client.Rest) – An object for interacting with the Ensembl REST API.
*args – Arguments to the gwas_norm.variants.norm.RefNorm
**kwargs – Keyword arguments to the gwas_norm.variants.norm.RefNorm
Notes
This version queries the Ensembl REST client to get the reference sequence information.
- close()#
A dummy close method, to avoid opening reference genome files.
- open()#
A dummy open method, to avoid opening reference genome files.
- search_assembly(chr_name, start_pos, end_pos, strand=1, species='human', data_format='plain', **kwargs)#
Search the reference genome assembly for sequences encompassed by
chr_name
,start_pos
andend_pos
.- Parameters:
chr_name (str) – The chromosome to extract from.
start_pos (int) – The 1-based start position of chr_name.
end_pos (int) – The end position
strand (int, optional, default: 1) – The strand for the returned sequence.
species (str, optional, default: human) – The species for the sequence.
data_format (str, optional, default: plain) – The data format for the sequence, can be either
fasta
or plain
. I have not tried fasta
before.**kwargs – Keyword arguments passed to ensembl_rest_client.client.Rest.get_sequence_region.
- Returns:
sequence – The sequence encompassed by the coordinates.
- Return type:
str
- class gwas_norm.variants.norm.RefNorm(ref_fasta, index=None, cachesize=100000)#
Bases:
object
Handles interactions with the reference genome sequence for reference allele lookups and INDEL normalisations.
- Parameters:
ref_fasta (str) – An indexed FASTA reference sequence
index (str or NoneType, optional, default: NoneType) – An index file, if
NoneType
, then it is assumed to be the same name as the reference FASTA file.cachesize (int, optional, default: 100000) – The size of sequences to store in memory
- cache_ref_assembly_match(chr_name, start_pos, alleles)#
Given the coordinates and multiple alleles return a boolean list indicating which alleles match the reference genome assembly.
This is useful if you suspect that the ref and alt alleles have been swapped around for some reason.
This version will lookup against a pre-cached reference assembly sequence. The idea behind the cache is to store a string of reference assembly in memory that will be used for several lookups, for example if multiple consecutive sites are being queried. However, it does not seem to confer much (ANY) performance benefit but is left here for reference.
- Parameters:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the reference allele.
alleles (list) – The alleles to check against the reference genome assembly.
- Returns:
matches – True if the allele matches the reference genome assembly, False if not
- Return type:
list of bool
- close()#
Close a FASTA reference genome
- static get_minimal_alleles(start_pos, ref, alt)#
Get the minimal representation of a variant, based on the ref + alt alleles in a VCF. This is used to make sure that multiallelic variants in different datasets, with different combinations of alternate alleles, can always be matched directly. Taken from: here <https://github.com/ericminikel/minimal_representation/blob/master/minimal_representation.py>.
- Parameters:
start_pos (int) – The start position of the reference allele.
ref (str) – The reference allele.
alt (str) – The alternate allele.
- Returns:
start_pos (int) – The start position of the minimally represented reference allele.
ref (str) – The minimally represented reference allele.
alt (str) – The minimally represented alternate allele.
- normalise_alleles(chr_name, start_pos, ref, alt)#
Normalise INDEL alleles according to Tan et al. 2015. This code is also heavily based on this.
- Parameters:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the reference allele.
ref (str) – The reference allele must be ATCGNatcgn-. The reference allele will be checked against the reference genome assembly.
alt (str) – The alternate allele must be ATCGNatcgn-.
- Returns:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the normalised reference allele.
norm_ref (str) – The normalised reference allele.
norm_alt (str) – The normalised alternate allele.
was_normalised (bool) – True if the alleles have undergone normalisation, False if not
- Raises:
common.SequenceError – If either the reference allele or alternate allele sequence is not
ATCGNatcgn-
ValueError – If the ref and alt alleles are the same
KeyError – If the reference allele can not be found in the reference assembly
- normalise_multi_alleles(chr_name, start_pos, ref, *alts)#
Normalise multi-allelic INDEL alleles according to Tan et al. 2015. This code is also heavily based on this.
- Parameters:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the reference allele.
ref (str) – The reference allele must be ATCGNatcgn-. The reference allele will be checked against the reference genome assembly.
*alts (str) – One or more alternate alleles, each must be ATCGNatcgn-.
- Returns:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the normalised reference allele.
norm_ref (str) – The normalised reference allele.
norm_alts (tuple of str) – The normalised alternate alleles.
was_normalised (bool) – True if the alleles have undergone normalisation, False if not
- Raises:
common.SequenceError – If either the reference allele or alternate allele sequence is not
ATCGNatcgn-
ValueError – If the ref and alt alleles are the same
KeyError – If the reference allele can not be found in the reference assembly
- open()#
Open a FASTA reference genome
- ref_assembly_match(chr_name, start_pos, alleles)#
Given the coordinates and multiple alleles return a boolean list indicating which alleles match the reference genome assembly.
This is useful if you suspect that the ref and alt alleles have been swapped around for some reason.
- Parameters:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the reference allele.
alleles (list) – The alleles to check against the reference genome assembly.
- Returns:
matches – True if the allele matches the reference genome assembly, False if not
- Return type:
list of bool
- search_assembly(chr_name, start_pos, end_pos)#
Search the reference genome assembly for sequences encompassed by chr_name, start_pos and end_pos.
- Parameters:
chr_name (str) – The chromosome to extract from.
start_pos (int) – The 1-based start position of chr_name.
end_pos (int) – The end position
- Returns:
sequence – The sequence encompassed by the coordinates.
- Return type:
str
- valid_ref(chr_name, start_pos, ref_allele)#
Verify that the given reference allele matches the reference assembly.
- Parameters:
chr_name (str) – The chromosome containing the reference allele.
start_pos (int) – The start position of the reference allele.
ref_allele (str) – The reference allele to check against the reference genome assembly
- Returns:
match – True if the reference allele matches the reference genome assembly, False if not
- Return type:
bool
- gwas_norm.variants.norm.test_coords(x, y)#
Find the overlap between two regions. The start and end coordinates are cast to int before comparing
- Returns:
overlap – The overlap between the x site and the y site or 0 if there is no overlap
- Return type:
int
gwas_norm.variants.mapper
#
Handle variant mapping and annotation based on the mapping VCF
- class gwas_norm.variants.mapper.BaseMapper(resolver=None, ref_genome=None, ref_genome_idx=None)#
Bases:
object
A base class for the mapper, do not use directly
- Parameters:
resolver (gwas_norm.variants.resolvers.BaseResolver) – A resolver with methods to attempt to rescue poor quality mappings and imputation of the ALT allele. If NoneType then the base class is used. This returns a no mapping result for both methods.
ref_genome (str) – The path to an indexed FASTA file containing a reference assembly that can be used by the mapper if needed. The reference assembly will be used in cases where variants have not been mapped, so the source variant is normalised (if an INDEL) just in case it is the way the indel is provided that is causing it not to map.
ref_genome_idx (str) – The path to the index file for the reference genome. If this is NoneType it is assumed to have the same basename as the ref_genome.
- DNA_REGEXP = re.compile('^[ATCGatcg-]+$')#
- best_mapping(source_chr_name, source_start_pos, source_ref_allele, source_alt_allele, mapping_rows, resolve=True, existing_flags=0, input_row=None, strand=1, var_id=None)#
Run the mapping algorithm for a single source variant against the localised matching mapping rows.
- Parameters:
source_chr_name (str) – The chromosome name of the source variant
source_start_pos (int) – The start position of the source variant
source_ref_allele (str) – The reference allele of the source variant
source_alt_allele (str) – The alternate allele of the source variant. If this is NoneType then the alternate allele will be attempted to be assigned from the appropriate mapping in mapping_rows.
mapping_rows (list of tuple or list) – Potential matching mappings for the source variant. These should represent rows from a mapping vcf file. So the chr_name at [0], start_pos at [1], var_id at [2], ref_allele at [3], alt_allele at [4] and an input row at [5] (list but can be empty). It is assumed that the alt allele is bi-allelic.
resolve (bool, optional, default: True) – In the event of no mapping, that is the mappings do not reach sufficient quality, should the resolution method get called? This is mainly here to stop infinite recursion should the resolution method have to call _map_variant again.
existing_flags (int, optional, default: 0) – Any existing mapping flags that need to be added to the map_bits. This is so recursive calls can pass mapping information to each other.
input_row (Any, optional, default: NoneType) – Any input rows that you want to store alongside the mapping. Typically these will be a list representing data in columns but could be anything.
strand (int, optional, default: 1) – The strand for the source variant, should be either 1 (forward) or -1 (reverse).
var_id (str or NoneType, optional, default: NoneType) – Any existing identifiers for the variant. This will be passed to the resolution methods in the event of no mapping.
Notes
Whilst this can be called directly, usually it is called from the map_variant method, as the map_variant method will typically deal with things like INDEL normalisation (and subsequent re-localisation) as well. This assumes that a set of rows have already been localised, probably based on chr:start_pos (although these are checked by this function as well). So this will take localised rows and define the best mapping row or return a no mapping result if none of them are any good.
- close()#
Close anything used by the mapping class
- classmethod decode_mapping_flags(flags)#
Decode the bitwise mapping flags into human readable strings.
- Parameters:
flags (int) – The bitwise flags to decode.
- Returns:
decoded_flags – The bitwise flags decoded into human readable strings.
- Return type:
list of str
- classmethod get_mapping_error(source_coords, error=None, nsites=0, input_row=None)#
Return a mapping with ERROR bits and the associated error (if available).
- Parameters:
source_coords (tuple) – The coordinates of the variant that we are trying to map. Should have chr_name at [0], start_pos at [1], ref_allele at [2] and alt_allele at [3].
error (Exception, optional, default: NoneType) – An exception to add to the error mapping.
nsites (int, optional, default: 0) – The number of sites in the mapping.
input_row (Any) – The row from the input file that we tried to map. Typically this will be a list but could be anything.
- Returns:
error_mapping – A mapping result with mapper.ERROR.bits
- Return type:
mapper.MappingResult
- classmethod get_mapping_var_id(mapping)#
Extract the variant identifier from a mapping, the variant identifier will be in the map_row (currently at index 2).
- Parameters:
mapping (list) – This has the source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3].
- Returns:
var_id – The variant identifier, this baseclass version will return
''
- Return type:
str
- classmethod get_no_data_mapping(source_coords, nsites=0, input_row=None)#
Return an empty no data mapping.
- Returns:
no_data_mapping – A mapping result with mapper.vc.NO_DATA.bits
- Return type:
mapper.MappingResult
- map_variant(*args)#
Placeholder for the map_variant method
- Parameters:
*args – Ignored
- Raises:
NotImplementedError – Indicate that it needs overriding
- open()#
Initialise the source and mapping files
- classmethod order_mappings(source_chr_name, source_start_pos, source_ref_allele, source_alt_allele, mapping_rows, strand=1)#
Order potential mappings (broadly localised mappings) from most relevant to least relevant.
There is a threshold criteria below which a localised mapping is not deemed to map to the source variant attributes, so it will be omitted from the returned list entirely.
- Parameters:
source_chr_name (str) – The chromosome name of the source variant
source_start_pos (int) – The start position of the source variant
source_ref_allele (str) – The reference allele of the source variant
source_alt_allele (str) – The alternate allele of the source variant. If this is NoneType then the alternate allele will be attempted to be assigned from the appropriate mapping in mapping_rows.
mapping_rows (list of tuple) – Potential matching mappings for the source variant. These should represent rows from a mapping vcf file. So the chr_name at [0], start_pos at [1], var_id at [2], ref_allele at [3], alt_allele at [4]. It is assumed that the alt allele is bi-allelic.
strand (int, optional, default: 1) – The strand for the match can be either 1 or -1. This is optional as most strand information is not available in files so we assume 1.
- Returns:
mappings – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data.
- Return type:
list of list
- classmethod quick_match(source, mapping)#
Define how well a variant from an untrusted source matches one from a trusted source.
- Parameters:
source (tuple) – The first variant to match, this is assumed to be derived from an untrusted source and we are assessing how well it matches the variant from the trusted source. The untrusted label means that the alt-allele can be set to NoneType and the alleles can be I/D/R designations. This tuple should have chr_name (str), start_pos (int), strand (1/-1) followed by separate elements for all the alleles, ref and alt.
mapping (tuple) – The second variant to match, this is assumed to have “complete” information, i.e. derived from a trusted and robust source. The trusted source label means that the alt-allele must be present and all the alleles must be DNA sequences i.e. ATCG. This tuple should have chr_name (str), start_pos (int), strand (1/-1) followed by separate elements for all the alleles, ref and alt.
- Returns:
source (tuple) – The source variant tuple of chr_name (str), start_pos (int), strand (1/-1) followed by separate elements for all the alleles, ref and alt.
mapping (tuple) – The mapping variant tuple of chr_name (str), start_pos (int), strand (1/-1) followed by separate elements for all the alleles, ref and alt.
data_bits (int) – The mapping bits between source and mapping variants
- Raises:
ValueError – If the source allele types can’t be recognised or the mapping alleles are not DNA
Notes
This performs basic matching between two variants represented in tuples of chr_name (str), start_pos (int), strand (1/-1), followed by separate elements for all the alleles.
Currently, quick_match only supports bi-allelic variants and assumes that the source alleles and the mapping alleles are all the same case. The source variant can be from the ambiguous source and the mapping variant should be from a solid data source. As such the alternate allele in the source can be NoneType but both alleles must be present in the mapping variant. Similarly, the source variant can be an I/D/R allelic representation but the mapping variant must be made up from ATCG. The matching algorithm is as follows (note that the Ensembl - allele format is not supported):
1. Attempt to match the chromosome, if there is no match then return vc.NO_DATA.
2. Attempt to match the start position, if there is no match then return CHR.
3. Extract the alleles from the source and mapping.
4. Is the source reference allele DNA? If so, see step 5. If not, see step 9.
5. Is the source alt allele DNA or NoneType? If DNA see step 6. If NoneType see step 8. If neither of these - error.
6. Do the source and mapping alleles match? If so return, if not see step 7.
7. Flip strand and test the alleles again and return the result.
8. Test the source ref allele for a match to the mapping. If it matches return. If not see step 7.
9. Are the source alleles I/D/R? If so see step 10. If not raise an error.
10. I/D/R alleles are currently not handled.
- property resolver#
Return the resolver used by the mapper (resolvers)
- static sort_map(mapping)#
A sort key method for ordering the mapping rows.
- Parameters:
mapping (tuple) – The mapping bits should be at element [3].
- validate_resolver()#
Make sure the resolver is compatible with the data source for the mapper
- classmethod var_id_match(mappings, source_var_id)#
Assess the mappings to see if they match the source_var_id. If they do, adjust the map_bits accordingly.
- Parameters:
mappings (list of list) – Each element is a localised mapping and each localised mapping list has the structure: source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3].
source_var_id (str or NoneType) – The source variant identifier that the mappings are assessed against.
- Returns:
mappings – Each element is a localised mapping and each localised mapping list has the structure: source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3]. The return is not strictly required as the mapping bits are updated in place.
- Return type:
list of list
- class gwas_norm.variants.mapper.EnsemblVariantMapper(rest_client, *args, **kwargs)#
Bases:
BaseMapper
A variant mapper that localises variant coordinates based on queries against the Ensembl REST API.
- Parameters:
rest_client (ensembl_rest_client.client.Rest) – An object for interacting with the Ensembl REST API.
*args – Arguments to the gwas_norm.variants.mapper.BaseMapper
**kwargs – Keyword arguments to the gwas_norm.variants.mapper.BaseMapper
Notes
This is only suitable for small queries and not mapping millions of variants. If no reference genome is passed then any allele normalisation is based on queries against the Ensembl Rest API.
- DNA_REGEXP = re.compile('^[ATCGatcg-]+$')#
A compiled regular expression for recognising DNA strings with deletion symbols - (re.Pattern)
- classmethod get_mapping_var_id(mapping)#
Extract the variant identifier from a mapping, the variant identifier will be in the map_row (currently at index 2).
- Parameters:
mapping (list) – This has the source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3].
- Returns:
var_id – The variant identifier
- Return type:
str
- map_variant(chr_name, start_pos, ref_allele, *args, alt_allele=None, strand=1, allele_norm=True, existing_flags=0, var_id=None, **kwargs)#
Map a single variant using the Ensembl REST API.
- Parameters:
chr_name (str) – The chromosome name of the variant
start_pos (int) – The 1-based start position for the variant
ref_allele (str) – The reference allele for the variant, allowed values are ATCGatcg or - for a deletion.
*args – Any positional arguments (ignored).
alt_allele (str, optional, default: NoneType) – The alternate allele, allowed values are ATCGatcg or - for a deletion or NoneType.
strand (int, optional, default: 1) – The strand for the variant (if known), should be 1 for positive/forward strand or -1 for negative/reverse strand.
allele_norm (bool, optional, default: True) – In the event of no mapping, do you want to allele normalise and attempt to map again (if normalisation has occurred).
existing_flags (int, optional, default: 0) – Any existing mapping flags that you want to pass through to the final mapping. The end user should not need to touch this and it is mainly meant for recursive calls or for subclasses to use.
var_id (str or NoneType, optional, default: NoneType) – Any existing identifiers for the variant. This will be passed to the resolution methods in the event of no mapping.
**kwargs – Any keyword arguments (ignored).
- query_region(chr_name, start_pos, end_pos)#
Query for the region using the REST API.
- Parameters:
chr_name (str) – The chromosome name of the variant
start_pos (int) – The 1-based start position for the variant (post ensemblisation if an INDEL)
end_pos (int) – The end position for the variant
- Returns:
rest_data – This should be the result of an ensembl_rest_client.overlap.Overlap.get_region_overlap() query.
- Return type:
list of dict
- class gwas_norm.variants.mapper.ScanVcfVariantMapper(source_variants, mapping_vcf, tabix_vcf=None, chr_name='chr_name', start_pos='start_pos', strand=None, ref_allele='ref_allele', alt_allele=None, var_id=None, header=True, buffer=1000, source_join_key=None, mapping_join_key=None, buffer_sort_key=None, **kwargs)#
Bases:
BaseMapper
Map and annotate source variants from a flat input file (or file like object) against the mapping vcf file.
- Parameters:
source_variants (iterator) – An object that has a __next__ method implemented and can serve up a row from the file. The row should be represented as a list. The row that is given by the iterator does not have to have the start position as an integer, that is cast internally. However, if header is True, then the first row given by the iterator should be the header row. The source_variants file should also be sorted in the same way as the mapping_vcf mapping file.
mapping_vcf (gwas_norm.variants.mapper.VcfIterator) – An object that will iterate through the VCF mapping file, this file.
header (bool, optional, default: True) – Does the source_variants file have a header. If True this should be the first row given by the iterator.
tabix_vcf (list of gwas_norm.variants.mapper.TabixVcfVariantMapper or NoneType, optional, default: NoneType) – A mapping VCF file containing rare variants, this file is searched with tabix rather than scanned and it is only searched if a variant is not available in the mapping_vcf. It is designed to be accessed infrequently only as a last resort.
chr_name (str, optional, default: ‘chr_name’) – The name for the chromosome name column of source_variants file. If header is False, then this should be the column number.
start_pos (str, optional, default: ‘start_pos’) – The name of the start position column of source_variants file. If header is False, then this should be the column number.
ref_allele (str, optional, default: ‘ref_allele’) – The name of the reference allele column (or effect allele) of source_variants file. If header is False, then this should be the column number.
alt_allele (str or NoneType, optional, default: NoneType) – The name of the alternate allele column (or other allele), this should be NoneType if the source_variants file has no alt_allele column.
var_id (str or NoneType, optional, default: NoneType) – The name of the variant identifier column in the source_variants file. If header is False, then this should be the column number.
header – Does the source_variants file contain a header row. If so it should be the first row given by the iterator.
tmp_dir (str or NoneType, optional, default: NoneType) – The directory to write temp files. This is used if sort=True
buffer (int, optional, default: 1000) – The number of entries to buffer and check for any normalised alleles that would, change the sort order, the larger the buffer the less chance that any allele normalisation will interfere with the output sort order, this comes at the expense of memory.
source_join_key (function or NoneType, optional, default: NoneType) – A key function to use to extract the data values from each row of the input file that will be used to join to the corresponding data values in the mapping file (provided by the mapping_join_key). This should accept a row of data as a list and return a tuple of values to join on. The default (NoneType) will join on the chromosome name as a string and the base pair position as an integer.
mapping_join_key (function or NoneType, optional, default: NoneType) – A key function to use to extract the data values from each row of the mapping file that will be used to join to the corresponding data values in the input source file (provided by the source_join_key). This should accept a row of data as a list and return a tuple of values to join on. The default (NoneType) will join on the chromosome name as a string and the base pair position as an integer.
buffer_sort_key (function or NoneType, optional, default: NoneType) – A function to provide data values to be sorted on in the event that a mapped variant has been normalised. This attempts to ensure that the output rows are output in the correct sort order even when a potential disruptive event such as variant normalisation has occurred. This should accept a gwas_norm.variants.constants.MappingResult and return a tuple used to sort the mapping results in the buffer. The default, NoneType, will use a function that provides the mapping chromosome name (as a string) and the mapping (normalised) base pair position as an integer.
**kwargs – Keyword arguments passed to the gwas_norm.variants.mapper.BaseMapper.
- Raises:
ValueError – If the buffer value is < 0.
- DNA_REGEX = re.compile('^[ATCGatcg]+$')#
- MAPPING_FILE_TYPE#
alias of
VcfIterator
- check_new_chr_name(new_chr_name)#
Perform some basic sort order checks of a new chromosome name.
This is called when the source file being mapped moves to the next chromosome.
- Parameters:
new_chr_name (str) – The next chromosome name.
- Raises:
IndexError – If the input file and the mapping file have differing sort orders or if the input file is not sorted correctly (i.e. the chromosome names are not grouped)
- check_norm(chr_name, start_pos, ref_allele, alt_allele=None, strand=None)#
Normalise the
- close()#
Close source and the mapping files
- extract_alt_allele(source_row)#
Extract the alternate allele from the input row.
- Parameters:
source_row (list of str) – The source row to extract the alt allele from. It is assumed that the row has the same index positions as a row from a VCF file.
- Returns:
alt_allele – The value for the alt allele.
- Return type:
str
- extract_nothing(source_row)#
A dummy function that returns NoneType, irrespective of what has been passed.
- Parameters:
source_row (any) – Any arguments (ignored)
- Returns:
nothing – An empty return type.
- Return type:
NoneType
- extract_positive_strand(source_row)#
Return a positive strand value. This is the default strand extraction function.
- Parameters:
source_row (list of str) – The source row this is ignored.
- Returns:
strand – The value for a positive strand 1.
- Return type:
int
- extract_strand(source_row)#
Extract the strand value from the source row.
- Parameters:
source_row (list of str) – The source row to extract the strand from. Values that are interpreted as the strand are 1, -1, + and -. NoneType values result in a default strand of 1 being returned and anything else raises a ValueError.
- Returns:
strand – The value for a strand. If the value at that position can’t be cast to an int and is NoneType, then a default strand of 1 is returned; any other unrecognised value raises a ValueError.
- Return type:
int
- Raises:
ValueError – If the strand value is not one of the recognised values.
- extract_var_id(source_row)#
Extract the variant identifier from the input row.
- Parameters:
source_row (list of str) – The source row to extract the variant identifier from. It is assumed that the row has the same index positions as a row from a VCF file.
- Returns:
var_id – The value for the variant identifier.
- Return type:
str
- get_buffer_sort_key()#
Get the sort key to use to sort the buffer when a normalised variant is encountered.
- Returns:
sort_key – The function that provided the key to sort on.
- Return type:
function
- get_mapping_join_key()#
Get the join key used by the mapping file.
- Returns:
join_key – A function that will provide a tuple from an input row or a tuple of tuples with (input row index, datatype) that will be used to query the input row for the join keys
- Return type:
function or tuple
- static get_mapping_var_id(mapping)#
Extract the variant identifier from a mapping, the variant identifier will be in the map_row (currently at index 2).
- Parameters:
mapping (list) – This has the source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3].
- Returns:
var_id – The variant identifier, this baseclass version will return
''
- Return type:
str
- get_source_join_key()#
Get the join key used by the source file that will be mapped.
- Returns:
join_key – A function that will provide a tuple from an input row or a tuple of tuples with (input row index, datatype) that will be used to query the input row for the join keys
- Return type:
function or tuple
- property header#
Return the header row of the input source file (str).
- init_mapping_file()#
Initialise the mapping file that will be scanned through using a join with the source file
- init_source_file()#
Initialise the source file that will be mapped.
- map_variant(join_row)#
Perform mapping on rows returned from a join scan against the mapping file.
In this scenario there could be multiple rows from the source file being mapped matched against multiple potential mappings. If there are multiple source_rows then they are all mapped and stored.
- Parameters:
join_row (list of list) – The rows from the source/mapping file that are matched on chromosome name and start position. The source rows will be in a list at element [0] and the mapping rows will be represented as a list at element [1]. There may be multiple source rows and multiple mapping rows. So the source rows are treated independently and mapped to the mapping rows.
- classmethod mapping_correct(mapping_file)#
Determine if the mapping file is the correct type.
- Parameters:
mapping_file (Any) – The mapping file object to test.
- Raises:
TypeError – If the mapping file is not the correct type.
- open()#
Initialise the source and mapping files
- property output_sorted#
Return the output is sorted value (bool).
Notes
A flag indicating if the output file is sorted or not. If this is False then it means that an allele has been normalised in such a way that it puts it at the top of the sort order in the buffer. This may mean that the output requires sorting as the normalised value may be further up the sort order but that has already been output. If this is True, then everything should be ok. This value should be checked post mapping run.
- sort_buffer(mapping_data)#
Perform a full sort on the buffer.
- Parameters:
mapping_data (variants.constants.MappingResult) – A mapping to check for sort order against the mapping buffer.
Notes
This will add the mapping data to the buffer and then perform the sort. The key from the source file JoinIterator is used to perform the sort.
- property source#
Get the input source.
- test_sort_order(mapping_data)#
Test the mapping data against the buffer to determine if the sort order is correct.
- Parameters:
mapping_data (variants.constants.MappingResult) – A mapping to check for sort order against the mapping buffer.
- Returns:
is_sorted – An indicator of the sort order status.
If True, everything is in the correct sort order; if False, then the mapping result belongs up the sort order somewhere.
- Return type:
bool
- Raises:
IndexError – If the buffer does not contain enough entries to test.
Notes
The key from the source file JoinIterator is used to perform the sort.
- validate_resolver()#
Make sure the resolver is compatible with the data source for the mapper
- class gwas_norm.variants.mapper.TabixVcfVariantMapper(mapping_vcf_file, *args, **kwargs)#
Bases:
BaseMapper
,_BaseVcfInterface
A variant mapper that localises variant coordinates based on tabix queries of VCF files. This class operates on a Gwas Norm VCF mapping file.
- Parameters:
mapping_vcf_file (str) – The path to the mapping VCF file
mapping_vcf_idx (str or NoneType, optional, default:
NoneType
) – The path to the mapping VCF index. If this isNoneType
**kwargs – Arguments to the mapper.BaseMapper
- close()#
Close the tabix VCF file
- classmethod get_mapping_var_id(mapping)#
Extract the variant identifier from a mapping, the variant identifier will be in the map_row (currently at index 2).
- Parameters:
mapping (list) – This has the source variant at [0], mapping variant at [1], the mapping row at [2] and the data_bits (map_bits) at [3].
- Returns:
var_id – The variant identifier, this baseclass version will return
''
- Return type:
str
- map_variant(chr_name, start_pos, ref_allele, alt_allele=None, strand=1, allele_norm=True, existing_flags=0, var_id=None, input_row=None, **kwargs)#
Attempt to map a variant by localising its chr:pos based on a tabix query.
- Parameters:
chr_name (str) – The chromosome name of the variant
start_pos (int) – The 1-based start position for the variant
ref_allele (str) – The reference allele for the variant, allowed values are ATCGatcg or - for a deletion
alt_allele (str, optional, default: NoneType) – The alternate allele, allowed values are ATCGatcg or - for a deletion or NoneType
strand (int, optional, default: 1) – The strand for the variant (if known), should be 1 for positive/forward strand or -1 for negative/reverse strand.
allele_norm (bool, optional, default: True) – In the event of no mapping, do you want to allele normalise and attempt to map again (if normalisation has occurred).
existing_flags (int, optional, default: 0) – Any existing mapping flags that you want to pass through to the final mapping. The end user should not need to touch this and it is mainly meant for recursive calls or for subclasses to use.
var_id (str or NoneType, optional, default: NoneType) – Any existing identifiers for the variant. This will be passed to the resolution methods in the event of no mapping.
**kwargs – Any keyword arguments passed to best_mapping.
- open()#
Open the tabix VCF file. Note that the file is not opened as a VCF file.
- class gwas_norm.variants.mapper.VcfIterator(mapping_vcf_file)#
Bases:
_BaseVcfInterface
An iterator that uses pysam to iterate through a VCF file from start to finish and provide rows of information as lists.
- close()#
Close the tabix VCF file
- Returns:
self – For chaining.
- Return type:
gwas_norm.variants.mapper.VcfIterator
- open()#
Open the tabix VCF file. Note that the file is not opened as a VCF file.
- Returns:
self – For chaining.
- Return type:
gwas_norm.variants.mapper.VcfIterator
- gwas_norm.variants.mapper.allele_idx(alleles, all_alleles)#
Create a partition of all_alleles into the alleles that are present in alleles and those that are absent from alleles.
- Parameters:
- Returns:
present (list of tuple) – The number of tuples reflects how many are present in all_alleles. Each tuple has the allele at [0] and its index position in all_alleles at [1]. This list will be empty if no alleles are present.
not present (list of tuple) – The number of tuples reflects how many are absent in all_alleles. Each tuple has the allele at [0] and its index position in all_alleles at [1]. This list will be empty if all alleles are present.
- gwas_norm.variants.mapper.get_mapping_coords(mapping_result)#
Extract the coordinates from a mapped variant, if there is no mapping then the source coordinates are used.
- Parameters:
mapping_result (gwas_norm.variants.constants.MappingResult) – The mapping result to extract from.
- Returns:
chr_name (str) – The chromosome name.
start_pos (int) – The start position
- gwas_norm.variants.mapper.map_data_frame(df, mapper, chr_name='chr_name', start_pos='start_pos', ref_allele='ref_allele', alt_allele='alt_allele', strand='strand', decode_map_info=False)#
Perform variant mapping on a pandas.DataFrame.
- Parameters:
df (pandas.DataFrame) – The source data frame to map.
mapper (gwas_norm.variants.mapper.EnsemblMapper or gwas_norm.variants.mapper.EnsemblMapper) – The mapper to perform the mapping against the data frame.
chr_name (str, optional, default: chr_name) – The name of the chromosome column in the pandas.DataFrame being mapped.
start_pos (str, optional, default: start_pos) – The name of the start position column in the pandas.DataFrame being mapped.
ref_allele (str, optional, default: ref_allele) – The name of the reference allele column in the pandas.DataFrame being mapped.
alt_allele (str, optional, default: alt_allele) – The name of the alternate allele column in the pandas.DataFrame being mapped. If it is the default value and does not exist then it is assumed that the alt allele is missing. If it is not the default value and is missing then it is treated as an error.
strand (str, optional, default: strand) – The name of the strand column in the pandas.DataFrame being mapped. If it is the default value and does not exist then it is assumed that the strand is missing and it will be created and assumed that the strand is 1. If it is not the default value and is missing then it is treated as an error.
- Returns:
mapped_df – The mapped pandas.DataFrame, note that this is a copy of the original with the mapped columns added
- Return type:
pandas.DataFrame
- gwas_norm.variants.mapper.return_none(*args, **kwargs)#
A dummy function that will accept any arguments and return
NoneType
- gwas_norm.variants.mapper.reverse_complement(dna)#
reverse complement a DNA string
- Parameters:
dna (str) – The DNA string to reverse complement
- Returns:
rc_dna – The DNA string that has been reverse complemented
- Return type:
str
Notes
This is case insensitive and only handles ATCGatcg. Any non-DNA letters are passed through and reversed but will not raise any errors.
- gwas_norm.variants.mapper.split_alts(ref, alts)#
Generate individual bi-allelic ref/alt pairs from a ref alt set where the alts are potentially comma separated.
This can handle internal and trailing white space as well.
- Parameters:
ref (str) – The reference allele, this will still be the reference in each bi-allelic pair after the alt split.
alts (str) – Alternate alleles that should be delimited with a ,.
- Returns:
biallelic – The bi-allelic splits of the alt alleles. If there is only a single alt allele then this list will only contain a single element
- Return type:
list of tuple
- gwas_norm.variants.mapper.variant_type(ref, alt)#
Define the variant type, i.e. balanced polymorphism, insertion or deletion. Also, if the variant is in Ensembl format.
- Parameters:
ref (str) – The reference allele. Allowed values are ATCGatcg-.
alt (str) – The alternate allele. Allowed values are ATCGatcg-.
- Returns:
variant_type (int) – The types are defined in constants within the mapper module.
is_ensembl (bool) – True if the alleles appear in Ensembl format False if not.
- Raises:
ValueError – If the variant type can’t be determined or if ref and alt are the same.
Notes
This only works for bi-allelic variants. Also, it does not do any checking of the actual sequence, other than looking for a -, to indicate an Ensembl format INDEL.
- gwas_norm.variants.mapper.vcf_to_ensembl(pos, ref, alts, dbcoords=False)#
Convert VCF coordinates and alleles to ensembl ones, this assumes that everything is on the + strand. See the VEP documentation on the differences between how INDELs are defined in Ensembl and VCF files.
- Parameters:
pos (int) – The VCF POS column
ref (str) – The VCF REF column
alts (list of str) – The VCF alternate alleles column
dbcoords (bool, optional, default: False) – Adjust INDEL coordinates to match the Ensembl database. In the case of INSERTIONS this means start>end. If False then start == end (as in the Ensembl GVF files).
- Raises:
ValueError – If the alleles are not DNA or if the alleles are already ensembl format
gwas_norm.variants.resolvers
#
classes to handle the resolution of poor mappings and for the extraction of variant metadata from mapping files.
- class gwas_norm.variants.resolvers.BaseResolver(*args, **kwargs)#
Bases:
object
The base resolver class. Do not use directly
- Parameters:
*args – Any arguments (ignored)
**kwargs – Any keyword arguments (ignored)
Notes
The idea behind a resolver class is to handle situations where a variant localises but can’t be mapped. So, the user can define their own methods for dealing with variant resolution using data extracted from the localisation source. Two methods should be implemented:
resolve_poor_mapping and impute_alt_allele. The base class versions of these simply return a no mapping.
- DEFAULT_INTERNAL_DELIMITER = '|'#
The internal delimiter value for flattened data string when MappingFileResolver.extract_summary_metadata_row is called (str)
- METADATA_SUMMARY_ROW_HEADER = ['chr_name_mapper', 'start_pos_mapper', 'strand_mapper', 'ref_allele_mapper', 'alt_allele_mapper', 'nsites', 'map_info']#
The header column names that can accompany the data returned by the MappingFileResolver.extract_summary_metadata_row (list of str).
- MIN_ALT_IMPUTE_EVIDENCE = 11776#
The minimal amount of evidence that a variant must have in order to attempt alt allele imputation (int)
- extract_metadata(row)#
Extract the required metadata from a mapped row.
- Parameters:
row (pysam.VariantRecord) – A variant record with the populations (samples) and metadata (info) that is expected in a mapping file.
- Returns:
metadata – The extracted metadata
- Return type:
dict
Notes
The assumption here is that the row is derived from a VCF file that has the population allele numbers and counts in the sample sections and a VEP annotation in the INFO field. Also, all variants should be represented as bi-allelic.
- classmethod extract_summary_metadata_row(mapping, meta, *args, decode_map_info=False, **kwargs)#
Helper method to extract summary information as a list that can be written to file.
- Parameters:
mapping (gwas_norm.variants.constants.MappingResult) – A named tuple with the following fields source_coords, mapping_coords, errors, mapping_bits.
mapping – The mapping result to provide the mapped coordinates.
meta (dict) – The extracted metadata information from the mapping variant, i.e. the result of calling obj.extract_metadata().
*args – Any other positional arguments (currently ignored)
decode_map_info (bool, optional, default: False) – Should the map info be decoded into a delimited string or remain as an encoded bitwise integer.
**kwargs – Any other keyword arguments (currently ignored)
- Returns:
outrow – Summary mapping information that can be written to a flat csv file. The order of the columns are the same as MappingFileResolver.METADATA_SUMMARY_ROW_HEADER
- Return type:
list
Notes
This baseclass version extracts the mapping coordinate information and the mapping bits (or decoded string depending on decoding mapping flags). This can be overridden to provide different information if needed.
- classmethod get_mapping_error(source_coords, error=None, nsites=0, input_row=None)#
return a mapping with ERROR bits and the associated error (if available).
- Parameters:
error (Exception) – An exception to add to the error mapping.
- Returns:
error_mapping – A mapping result with mapper.ERROR.bits
- Return type:
mapper.MappingResult
- classmethod get_no_data_mapping(source_coords, nsites=0, input_row=None)#
return an empty no data mapping
- Returns:
no_data_mapping – A mapping result with mapper.vc.NO_DATA.bits
- Return type:
mapper.MappingResult
- impute_alt_allele(mappings, input_row=None)#
Attempt to assign an alternate allele based on data from all the mappings.
- Parameters:
mappings (list of list) – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data. Each tuple should have the structure of source coordinates (gwas_norm.variants.constants.MapCoords). mapping coords (gwas_norm.variants.constants.MapCoords), mapping bits and mapping row (the matching row from the mapping data source)
input_row (Any, optional, default: NoneType) – Mainly for passing through to any resolved mapped variants. This will be added to the
source_row
attribute.
- Returns:
no_data_mapping – The BaseMapper implementation returns a mapping result with gwas_norm.variants.constants.NO_DATA.bits.
- Return type:
mapper.MappingResult
Notes
This offers the option to resolve poor mappings using any available metadata in
mappings
.
- resolve_poor_mapping(mappings, input_row=None)#
A method that is called in the case when there are no high quality mappings. In reality there is probably not much to be done but this offers the option to resolve poor mappings using any available metadata in mappings.
- Parameters:
mappings (list of list) – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data. Each tuple should have the structure of source coordinates (gwas_norm.variants.constants.MapCoords). mapping coords (gwas_norm.variants.constants.MapCoords), mapping bits and mapping row (the matching row from the mapping data source)
input_row (Any, optional, default: NoneType) – Mainly for passing through to any resolved mapped variants. This will be added to the source_row attribute.
var_id (str or NoneType, optional, default: NoneType) – Any existing identifiers for the variant. This will be passed to the resolution methods in the event of no mapping.
- Returns:
no_data_mapping – The BaseMapper implementation returns a mapping result with gwas_norm.variants.constants.NO_DATA.bits
- Return type:
mapper.MappingResult
- static sort_map(mapping)#
A sort key method for ordering the mapping rows.
- Parameters:
mapping (tuple) – The mapping bits should be at element [3].
- validate_data_source(data_source)#
A method that can be used by the resolver to determine if the expected information is present in the data source.
- Parameters:
data_source (any) – A data source to validate.
Notes
For example it can be used to determine if certain expected fields are present within a VCF header.
- class gwas_norm.variants.resolvers.EnsemblResolver(rest_client, *args, species='homo_sapiens', cache_size=10, **kwargs)#
Bases:
PopulationResolver
The resolver class for use with the gwas_norm.variants.mapper.EnsemblVariantMapper.
- Parameters:
rest_client (ensembl_rest_client.client.Rest) – An object for interacting with the Ensembl REST API.
*args – Any arguments (ignored)
**kwargs – Any keyword arguments (ignored)
Notes
This handles variant mapping resolution (currently none) and alt allele imputation (currently none) for data derived from the Ensembl REST API.
- METADATA_SUMMARY_ROW_HEADER = ['chr_name_mapper', 'start_pos_mapper', 'strand_mapper', 'ref_allele_mapper', 'alt_allele_mapper', 'nsites', 'map_info', 'alt_allele_freq', 'used_pops', 'var_id', 'worst_consequence', 'worst_clinvar']#
The header column names that can accompany the data returned by the MappingFileResolver.extract_summary_metadata_row (list of str).
- property cache#
- extract_metadata(mapping)#
Extract the required metadata from a mapped row.
- Parameters:
mapping (gwas_norm.variants.constants.MappingResult) – The mapping result to extract the metadata from.
- Returns:
metadata – The extracted metadata
- Return type:
dict
Notes
Currently this just performs some mapping of the VEP worst consequences and ClinVar significance to constants defined in the gwas_norm.variants.vcf_info module. However, if needed, you can override this to perform additional REST queries to gather more info on the variant.
- classmethod extract_summary_metadata_row(mapping, meta, *args, decode_map_info=False, **kwargs)#
Helper method to extract summary information as a list that can be written to file.
- Parameters:
mapping (gwas_norm.variants.constants.MappingResult) – A named tuple with the following fields source_coords, mapping_coords, errors, mapping_bits.
mapping – The mapping result to provide the mapped coordinates.
meta (dict) – The extracted metadata information from the mapping variant, i.e. the result of calling obj.extract_metadata().
*args – Any other positional arguments (currently ignored)
decode_map_info (bool, optional, default: False) – Should the map info be decoded into a delimited string or remain as an encoded bitwise integer.
**kwargs – Any other keyword arguments (currently ignored)
- Returns:
outrow – Summary mapping information that can be written to a flat csv file. The order of the columns are the same as EnsemblResolver.METADATA_SUMMARY_ROW_HEADER
- Return type:
list
Notes
This can be overridden to provide different information if needed. Currently, it expects the metadata dict to have a key called clinical_significance which should contain a list of gwas_norm.variants.vcf_info.ClinVarSig namedtuples and a key consequence_type that should contain a gwas_norm.variants.vcf_info.So namedtuple.
- get_alt_allele_freq(chr_name, start_pos, ref_allele, alt_allele, strand, var_id)#
Get the alt allele frequency according to the population specification.
Notes
This will attempt to get the alt allele frequency from the internal cache first before issuing a query if it is not present. It will also store the result in the internal cache.
- impute_alt_allele(mappings, input_row=None)#
Attempt to assign an alternate allele based on data from all the mappings.
- Parameters:
mappings (list of list) – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data. Each tuple should have the structure of source coordinates (gwas_norm.variants.constants.MapCoords). mapping coords (gwas_norm.variants.constants.MapCoords), mapping bits and mapping row (the matching row from the mapping data source).
input_row (Any, optional, default: NoneType) – Mainly for passing through to any resolved mapped variants. This will be added to the
source_row
attribute.
- Returns:
no_data_mapping – The BaseMapper implementation returns a mapping result with gwas_norm.variants.constants.NO_DATA.bits.
- Return type:
mapper.MappingResult
Notes
This offers the option to impute the alternate allele when only one allele has been provided to the mapper, this works as follows:
- If there is only a single mapping in mappings then it is assumed that this is the only possibility for the mapping and that is returned.
- If mappings has > 1 mapping, then the minor allele frequency of each of the requested populations is queried and calculated. Then the mapping with the highest maf is returned as long as no other mappings have a maf >= unsafe_alt_infer (provided to __init__). If they do then a no mapping is returned.
- If only 1 mapping has any population data then it is assumed that it is the correct one.
All mappings that are returned from this method will be tagged with gwas_norm.variants.constants.ALT_INFERRED.bits.
If this default behaviour is not what you desire then you should sub-class this resolver and override this method to do exactly what you want.
- list_populations()#
- query_allele_freq(var_id)#
Query out all the available allele frequencies for a variant identifier
- Parameters:
var_id (str) – A variant identifier, typically this is an rsID.
- resolve_poor_mapping(mappings, input_row=None)#
A method that is called in the case when there are no high quality mappings. In reality there is probably not much to be done but this offers the option to resolve poor mappings using any available metadata in mappings.
- Parameters:
mappings (list of list) – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data. Each tuple should have the structure of source coordinates (gwas_norm.variants.constants.MapCoords). mapping coords (gwas_norm.variants.constants.MapCoords), mapping bits and mapping row (the matching row from the mapping data source).
input_row (Any, optional, default: NoneType) – Mainly for passing through to any resolved mapped variants. This will be added to the
source_row
attribute.
- Returns:
no_data_mapping – The BaseMapper implementation returns a mapping result with gwas_norm.variants.constants.NO_DATA.bits
- Return type:
mapper.MappingResult
- validate_data_source(*args)#
A method that can be used by the resolver to determine if the expected information is present in the data source.
- Parameters:
data_source (any) – A data source to validate.
Notes
For example it can be used to determine if certain expected fields are present within a VCF header.
- validate_populations()#
- class gwas_norm.variants.resolvers.MappingFileResolver(*args, **kwargs)#
Bases:
PopulationResolver
A resolver for use with gwas_norm mapping files. See here
- Parameters:
*args – Any arguments (ignored)
populations (NoneType or list of str or tuple, optional, default: NoneType) – One or more populations that are specified in the mapping VCF file. These are located where the sample fields are and the row format for them should be ‘AN:AC’. If this is NoneType then it is assumed that all populations that have been specified in the mapping file should be used. If the list contains strings, these should match the population names (sample names) in the mapping VCF file (valid for allele_freq_method='mean' and allele_freq_method='hierarchy'). If the list contains tuples, there are several options. If the tuple has two elements and the first is a string (population name) with the second being a float between 0-1 (weight for the population) (valid for allele_freq_method='mean'), then this will give a weighted alternate allele frequency. If the tuple contains a tuple of strings at [0] and a float between 0-1 (weight for the population) then this is valid for allele_freq_method='hierarchy' and the first available population allele frequency is used to calculate a weighted alternate allele frequency.
allele_freq_method (str, optional, default: mean) – The method that is used to determine the alternate allele frequency. Can be either ‘mean’ or ‘hierarchy’. If the user wants to override this class and add more then they should add them to the class variable gwas_norm.variants.resolvers.MappingFileResolver.ALLOWED_ALLELE_FREQ.
unsafe_alt_infer (float, optional, default: 0.05) – In the case when the ALT allele is not present, an attempt will be made to infer it from matches based on chr_name, start_pos, ref_allele. If there are multiple possible matches, then the match with the greatest MAF is selected (based on the populations requested). However, if > 1 population has a MAF >= this value then the ALT allele is not inferred and no mapping is returned. This must be a frequency between 0-1. This is designed to handle multiple common ALT choices.
**kwargs – Any keyword arguments to the base class
Notes
The idea behind a resolver class is to handle situations where a variant localises but can’t be mapped. So, the user can define their own methods for dealing with variant resolution using data extracted from the localisation source. Two methods should be implemented: resolve_poor_mapping and impute_alt_allele. The base class versions of these simply return a no mapping.
- METADATA_SUMMARY_ROW_HEADER = ['chr_name_mapper', 'start_pos_mapper', 'strand_mapper', 'ref_allele_mapper', 'alt_allele_mapper', 'nsites', 'map_info', 'alt_allele_freq', 'used_pops', 'var_id', 'worst_consequence', 'worst_clinvar', 'cadd_raw', 'cadd_phred', 'sift', 'polyphen', 'datasets']#
The header column names that can accompany the data returned by the MappingFileResolver.extract_summary_metadata_row (list of str).
- extract_cadd(row)#
Extract the cadd annotations from the row.
- Parameters:
row (pysam.VariantRecord) – A record derived from a mapping file.
- Returns:
var_id – The variant identifier, if not available will be a .
- Return type:
str
- static extract_datasets(row)#
Extract the datasets that the variant has been found in.
- Parameters:
row (pysam.VariantRecord) – A record derived from a mapping file.
- Returns:
datasets – The datasets that contain this variant. If this field is not available then NoneType is returned.
- Return type:
tuple or NoneType
- static extract_id(row)#
Extract the variant ID from the row.
- Parameters:
row (pysam.VariantRecord) – A record derived from a mapping file.
- Returns:
var_id – The variant identifier, if not available will be a .
- Return type:
str
- extract_metadata(mapping)#
Extract the required metadata from a mapped row.
- Parameters:
row (pysam.VariantRecord) – A variant record with the populations (samples) and metadata (info) that is expected in a mapping file.
- Returns:
metadata – The extracted metadata
- Return type:
dict
Notes
The assumption here is that the row is derived from a VCF file that has the population allele numbers and counts in the sample sections and a VEP annotation in the INFO field. Also, all variants should be represented as bi-allelic.
- static extract_pops(row)#
Extract the population data for a variant.
- Parameters:
row (pysam.VariantRecord) – A record derived from a mapping file.
- Returns:
variant_pops – This has the population names as keys and a tuple of (allele number int, allele count int) as values. If any data is missing for the population the allele number and allele count values will be NoneType.
- Return type:
dict
- classmethod extract_summary_metadata_row(mapping, meta, *args, decode_map_info=False, **kwargs)#
Helper method to extract summary information as a list that can be written to file.
- Parameters:
mapping (MappingResult) – A named tuple with the following fields source_coords, mapping_coords, errors, mapping_bits.
meta (dict) – The extracted metadata information from the mapping variant.
decode_map_info (bool, optional, default: False) – Should the map info be decoded into a delimited string or remain as an encoded bitwise integer.
- Returns:
outrow – Summary mapping information that can be written to a flat csv file. The order of the columns are the same as MappingFileResolver.METADATA_SUMMARY_ROW_HEADER
- Return type:
list
Notes
This can be overridden to provide different information if needed.
- extract_vep(row)#
Extract the vep annotations from the row.
- Parameters:
row (pysam.VariantRecord) – A record derived from a mapping file.
- Returns:
var_id – The variant identifier, if not available will be a .
- Return type:
str
- impute_alt_allele(mappings, input_row=None)#
Attempt to assign an alternate allele based on data from all the mappings.
- Parameters:
mappings (list of list) – The mapping_rows aligned with the source data in order from the mapping row with the best match to the source data to the mapping row with the worst match to the source data. Each tuple should have the structure of source coordinates (gwas_norm.variants.constants.MapCoords). mapping coords (gwas_norm.variants.constants.MapCoords), mapping bits and mapping row (the matching row from the mapping data source).
input_row (Any, optional, default: NoneType) – Mainly for passing through to any resolved mapped variants. This will be added to the
source_row
attribute.
- Returns:
no_data_mapping – The BaseMapper implementation returns a mapping result with gwas_norm.variants.constants.NO_DATA.bits.
- Return type:
mapper.MappingResult
Notes
This offers the option to impute the alternate allele when only one allele has been provided to the mapper, this works as follows:
- If there is only a single mapping in mappings then it is assumed that this is the only possibility for the mapping and that is returned.
- If mappings has > 1 mapping, then the minor allele frequency of each of the requested populations is queried and calculated. Then the mapping with the highest maf is returned as long as no other mappings have a maf >= unsafe_alt_infer (provided to __init__). If they do then a no mapping is returned.
- If only 1 mapping has any population data then it is assumed that it is the correct one.
All mappings that are returned from this method will be tagged with gwas_norm.variants.constants.ALT_INFERRED.bits.
If this default behaviour is not what you desire then you should sub-class this resolver and override this method to do exactly what you want.
- list_populations()#
- static validate_cadd_format(header)#
Validate the CADD (CADD) INFO field in the header to make sure it is appropriate for the mapping file.
- Parameters:
header (pysam.VariantHeader) – A pysam VCF file header to extract the info fields from
- validate_data_source(parser)#
A method that can be used by the resolver to determine if the expected information is present in the data source.
- Parameters:
data_source (any) – A data source to validate.
Notes
For example it can be used to determine if certain expected fields are present within a VCF header.
- static validate_header_format(header)#
Validate the format field in the header to make sure it is appropriate for the mapping file.
- Parameters:
header (pysam.VariantHeader) – A pysam VCF file header to extract the info fields from
- classmethod validate_population_kwarg(populations, allele_freq_method)#
Validate the populations that have been given to the object.
- Parameters:
populations (list of str or tuple, optional, default: NoneType) – One or more populations that are specified in the mapping VCF file. If the list contains strings, these should match the population names (sample names) in the mapping VCF file (valid for allele_freq_method='mean' and allele_freq_method='hierarchy'). If the list contains tuples, there are several options. If the tuple has two elements and the first is a string (population name) with the second being a float between 0-1 (weight for the population) (valid for allele_freq_method='mean'), then this will give a weighted alternate allele frequency. If the tuple contains a tuple of strings at [0] and a float between 0-1 (weight for the population) then this is valid for allele_freq_method='hierarchy' and the first available population allele frequency is used to calculate a weighted alternate allele frequency.
allele_freq_method (str) – The method that is used to determine the alternate allele frequency. Can be either ‘mean’ or ‘hierarchy’.
- Returns:
valid_populations – Where either the tuple is valid for the mean allele frequency method or the hierarchical allele frequency method.
- Return type:
list of tuple
- Raises:
ValueError – If there are no populations to evaluate or the allele_freq_method is unknown.
TypeError – If there are any issues with the population data format.
- validate_populations(header)#
Validate the population (sample) fields in the header and make sure that any requested populations are contained within them.
This also sets the requested populations to all available populations if they have not been set in the constructor.
- Parameters:
header (pysam.VariantHeader) – A pysam VCF file header to extract the info fields from
- static validate_vep_format(header)#
Validate the vep (CSQ) INFO field in the header to make sure it is appropriate for the mapping file.
- Parameters:
header (pysam.VariantHeader) – A pysam VCF file header to extract the info fields from
- class gwas_norm.variants.resolvers.PopulationResolver(*args, populations=None, allele_freq_method='mean', unsafe_alt_infer=0.05, freq_data_source=False, **kwargs)#
Bases:
BaseResolver
A resolver class that handles some population arguments for specifying groups of populations that can be used to gather allele frequency info
- ALLOWED_ALLELE_FREQ = ['mean', 'hierarchy']#
Allowed allele frequency methods (list of str)
- HIER_AAF = 'hierarchy'#
Keyword for hierarchical alternate allele frequency method (str)
- MEAN_AAF = 'mean'#
Keyword for mean alternate allele frequency method (str)
- METADATA_SUMMARY_ROW_HEADER = ['chr_name_mapper', 'start_pos_mapper', 'strand_mapper', 'ref_allele_mapper', 'alt_allele_mapper', 'nsites', 'map_info', 'alt_allele_freq', 'used_pops']#
The header column names that can accompany the data returned by the MappingFileResolver.extract_summary_metadata_row (list of str).
- static extract_hierarchy_aaf_counts(pop_weights, variant_pops)#
This performs a hierarchical population alternate allele frequency calculation.
- Parameters:
pop_weights (list of tuple) – Each tuple should contain a tuple of population names at [0] with the most favoured populations nearer the start of the tuple (population group) and a weight for the population group at [1]. All the weights across the population groups should add up to 1 (this is not checked here but is checked in the validation functions).
variant_pops (dict) – This has the population names as keys and a tuple of (allele number int, allele count int) as values. If any data is missing for the population the allele number and allele count values will be NoneType.
- Returns:
alt_allele_freq – The frequency of the alternate allele. If no allele frequencies are available for the sample then NoneType is returned.
- Return type:
float or NoneType
Notes
The hierarchical method works as follows. The idea is that there are some populations that you will want to preferentially take allele counts from (i.e. with the highest sample size). However, maybe there is no data for the favoured population so fallback populations can be supplied. If none of the populations suffice then NoneType is the fallback. So, this method is designed to return allele frequency data in as many cases as possible. This method can accept several population hierarchy groups with weights for each group. So you could have a group of European ancestry population and a group of South Asian ancestry populations and calculate a weighted alternate allele frequency with 0.75 European and 0.25 South Asian. Note that this does not check the format of
pop_weights
so ensure that gwas_norm.variants.resolvers.PopulationResolver.validate_population_kwarg is called first.
- static extract_hierarchy_aaf_freq(pop_weights, variant_pops)#
This performs a hierarchical population alternate allele frequency calculation.
- Parameters:
pop_weights (list of tuple) – Each tuple should contain a tuple of population names at [0] with the most favoured populations nearer the start of the tuple (population group) and a weight for the population group at [1]. All the weights across the population groups should add up to 1 (this is not checked here but is checked in the validation functions).
variant_pops (dict) – This has the population names as keys and a tuple of (allele number int, allele count int) as values. If any data is missing for the population the allele number and allele count values will be NoneType.
- Returns:
alt_allele_freq – The frequency of the alternate allele. If no allele frequencies are available for the sample then NoneType is returned.
- Return type:
float or NoneType
Notes
The hierarchical method works as follows. The idea is that there are some populations that you will want to preferentially take allele counts from (i.e. with the highest sample size). However, maybe there is no data for the favoured population so fallback populations can be supplied. If none of the populations suffice then NoneType is the fallback. So, this method is designed to return allele frequency data in as many cases as possible. This method can accept several population hierarchy groups with weights for each group. So you could have a group of European ancestry population and a group of South Asian ancestry populations and calculate a weighted alternate allele frequency with 0.75 European and 0.25 South Asian. Note that this does not check the format of
pop_weights
so ensure that gwas_norm.variants.resolvers.PopulationResolver.validate_population_kwarg is called first.
- static extract_mean_aaf_counts(pop_weights, variant_pops)#
This performs a (weighted) mean allele frequency calculation across all the supplied populations.
- Parameters:
pop_weights (list of tuple) – Each tuple should contain a population name at
[0]
(str) and a weight for the population at [1]. All the weights across the named populations should add up to 1 (this is not checked here but is checked in the validation functions).
variant_pops (dict) – This has the population names as keys and a tuple of (allele number int, allele count int) as values. If any data is missing for the population the allele number and allele count values will be
NoneType
.
- Returns:
alt_allele_freq – The frequency of the alternate allele. If no allele frequencies are available for the sample then
NoneType
is returned.
- Return type:
float or NoneType
Notes
This will calculate the weighted mean of the alternate allele frequency across the supplied populations. If any of the populations lack data then
NoneType
is returned. Note that this does not check the format of pop_weights
so ensure that gwas_norm.variants.resolvers.PopulationResolver.validate_population_kwarg is called first.
- static extract_mean_aaf_freq(pop_weights, variant_pops)#
This performs a (weighted) mean allele frequency calculation across all the supplied populations.
- Parameters:
pop_weights (list of tuple) – Each tuple should contain a population name at
[0]
(str) and a weight for the population at [1]. All the weights across the named populations should add up to 1 (this is not checked here but is checked in the validation functions).
variant_pops (dict) – This has the population names as keys and a tuple of (allele number int, allele count int) as values. If any data is missing for the population the allele number and allele count values will be
NoneType
.
- Returns:
alt_allele_freq – The frequency of the alternate allele. If no allele frequencies are available for the sample then
NoneType
is returned.
- Return type:
float or NoneType
Notes
This will calculate the weighted mean of the alternate allele frequency across the supplied populations. If any of the populations lack data then
NoneType
is returned. Note that this does not check the format of pop_weights
so ensure that gwas_norm.variants.resolvers.PopulationResolver.validate_population_kwarg is called first.
- static extract_no_aaf(*args, **kwargs)#
A dummy method that is called if no population data is available in the file. Under normal circumstances this should not be called.
- Parameters:
*args – Positional arguments - ignored
**kwargs – Keyword arguments - ignored
- Returns:
nothing – A non-existent alternate allele frequency
- Return type:
NoneType
- list_populations()#
- classmethod validate_population_kwarg(populations, allele_freq_method)#
Validate the populations that have been given to the object.
- Parameters:
populations (list of str or tuple) – One or more populations that are specified in the data source. If the list contains strings, these should match the population names (sample names) in the data source (valid for
allele_freq_method='mean'
and allele_freq_method='hierarchy'
). If the list contains tuples, there are several options. If the tuple has two elements and the first is a string (population name) with the second being a float between 0-1 (weight for the population) (valid for allele_freq_method='mean'
), then this will give a weighted alternate allele frequency. If the tuple contains a tuple of strings at [0]
and a float between 0-1 (weight for the population) then this is valid for allele_freq_method='hierarchy'
and the first available population allele frequency is used to calculate a weighted alternate allele frequency.
allele_freq_method (str) – The method that is used to determine the alternate allele frequency. Can be either ‘mean’ or ‘hierarchy’.
- Returns:
valid_populations – Where either the tuple is valid for the mean allele frequency method or the hierarchical allele frequency method.
- Return type:
list of tuple
- Raises:
ValueError – If there are no populations to evaluate or the
allele_freq_method
is unknown.
TypeError – If there are any issues with the population data format
gwas_norm.variants.constants
#
A load of common variables and regular expressions that can be used in ensembl related activities
- gwas_norm.variants.constants.ALLELE_DELIMITER#
The character used to separate ref and alt alleles whenever an allele string is built
- Type:
- gwas_norm.variants.constants.ALLOWED_TYPES#
The variant types. The keys are the type names used internally and the values are those that the user will see. A variant type is INSERTION, DELETION, SNP etc.
- Type:
dict
of str
- gwas_norm.variants.constants.BLANK_ALLELE#
The character used to represent a blank allele in an INDEL
- Type:
- gwas_norm.variants.constants.ID#
The ID bit, This is used to decode if the variant has ID information. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.CHR#
The CHR bit. This is used to decode if the variant has chromosome information. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.START#
The START bit, This is used to decode if the variant has start position. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.END#
The END bit, This is used to decode if the variant has end position. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.STRAND#
The STRAND bit, This is used to decode if the variant has strand. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.REF#
The REF bit, This is used to decode if the variant has reference allele information. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.ALT#
The ALT bit. This is used to decode if the variant has alternate allele(s) information. Also used to decode mapping information
- Type:
- gwas_norm.variants.constants.STRAND_FLIP#
The STRAND_FLIP bit. Used to encode if the variant has been flipped to map it to the “other” variant when map_to has been called
- Type:
- gwas_norm.variants.constants.REF_FLIP#
The REF_FLIP bit. Used to encode if the reference allele has been flipped to map it to the “other” variant when map_to has been called
- Type:
- gwas_norm.variants.constants.PARTIAL_ALLELE_MATCH#
The PARTIAL_ALLELE_MATCH bit. Used to encode/decode if the variant has a partial allele match with the “other” variant when map_to is called. Note that partial allele matches only occur when there is more than 1 ALT allele or we have N-nucleotide polymorphisms such as AT/CG
- Type:
- gwas_norm.variants.constants.COORD_OFFSET#
The COORD_OFFSET bit. Used to encode/decode if either the start or end coordinate has been positively or negatively offset when mapping to the “other” variant in a map_to call. Note that coordinate offsets are only applied when both variants in the mapping are INDELS
- Type:
- gwas_norm.variants.constants.MAPPING_DECODE#
The bit integers involved in decoding the mapping bits
- Type:
list
of int
- gwas_norm.variants.constants.MAPPING_DECODE_STR#
Strings that represent the bit integers when decoding the mapping bits
- Type:
list
of str
- gwas_norm.variants.constants.ALT = MappingFlag(name='ALT', bits=16384, description='A flag indicating a variant has been mapped based on alternate(other) allele')#
A flag indicating a variant has been mapped based on alternate (other) allele (MappingFlag)
Note that this flag is applied after any reference allele or strand flipping during the matching process. If multiple alternate alleles are available only one has to match for this to be flagged but if all of them have not been matched then PARTIAL_ALLELE_MATCH will also be set.
- gwas_norm.variants.constants.ALT_ALLELE_INFERRED = MappingFlag(name='ALT_ALLELE_INFERRED', bits=32, description='A flag indicating the the alternate allele has been inferred for a variant')#
A flag indicating that the alternate allele has been inferred for a variant (MappingFlag)
- gwas_norm.variants.constants.BALANCED = 1#
Constant for a balanced polymorphism, i.e. a SNP or things such as AG -> CT (int)
- gwas_norm.variants.constants.CHR = MappingFlag(name='CHR', bits=512, description='A flag indicating a variant has been mapped based on chromosome name')#
A flag indicating a variant has been mapped based on chromosome name (MappingFlag)
- gwas_norm.variants.constants.COMP_TRANSLATE = {45: 45, 65: 84, 67: 71, 71: 67, 84: 65, 97: 116, 99: 103, 103: 99, 116: 97}#
A DNA translation table to make complements of a DNA sequence (dict)
The keys are the ordinal values (int) of the corresponding character in the first string and the values are the ordinal values (int) of the respective translation character in the second string.
- class gwas_norm.variants.constants.Column(name, expected_idx, alias)#
Bases:
tuple
- alias#
Alias for field number 2
- expected_idx#
Alias for field number 1
- name#
Alias for field number 0
- gwas_norm.variants.constants.DELETION = 3#
Constant for a deletion polymorphism (int)
- gwas_norm.variants.constants.DNA_DEL_REGEX = re.compile('^[ATCGatcg-]+$')#
A pre-compiled regexp of constants.DNA_DEL_STR a regular expression definition that looks for DNA bases (case-insensitive) and deletions represented as a hyphen (re.pattern)
- gwas_norm.variants.constants.DNA_DEL_STR = '^[ATCGatcg-]+$'#
A regular expression definition that looks for DNA bases (case-insensitive) and deletions represented as a hyphen (str)
- gwas_norm.variants.constants.DNA_REGEX = re.compile('^[ATCGatcg]+$')#
A pre-compiled regexp of constants.DNA_STR a regular expression definition that looks for DNA bases (case-insensitive) (i.e. ATCGatcg) (re.pattern)
- gwas_norm.variants.constants.DNA_STR = '^[ATCGatcg]+$'#
A regular expression definition that looks for DNA bases (case-insensitive) (i.e. ATCGatcg) (str)
- class gwas_norm.variants.constants.DataSet(name, bits)#
Bases:
tuple
- bits#
Alias for field number 1
- name#
Alias for field number 0
- gwas_norm.variants.constants.END = MappingFlag(name='END', bits=4096, description='A flag indicating a variant has been mapped based on end coordinate')#
A flag indicating a variant has been mapped based on end coordinate. End position is not checked as it is inferred from ref allele matching. (MappingFlag)
- gwas_norm.variants.constants.ENSEMBL_DELETION = '-'#
The symbol that represents a deletion allele in Ensembl format (str)
- gwas_norm.variants.constants.ENS_ID_REGEX = re.compile('(ENS[GTP]\\d{11})(?:\\.\\d+)?')#
A pre-compiled regexp of constants.ENS_ID_STR, a regular expression definition that looks for Ensembl (human) identifiers, i.e. ENSG/ENST/ENSP/ENSE (re.pattern)
- gwas_norm.variants.constants.ENS_ID_STR = '(ENS[GTPE]\\d{11})(?:\\.\\d+)?'#
A regular expression definition that looks for Ensembl (human) identifiers, i.e. ENSG/ENST/ENSP/ENSE (str)
- gwas_norm.variants.constants.ERROR = MappingFlag(name='ERROR', bits=1, description='A flag indicating a mapping has no data associated with it')#
A flag indicating a mapping has no data associated with it (MappingFlag)
- gwas_norm.variants.constants.FLAGS = [MappingFlag(name='ERROR', bits=1, description='A flag indicating a mapping has no data associated with it'), MappingFlag(name='ID', bits=2, description='A flag indicating a variant has been mapped based on variantidentifier'), MappingFlag(name='CHR', bits=512, description='A flag indicating a variant has been mapped based on chromosome name'), MappingFlag(name='START', bits=2048, description='A flag indicating a mapping has no data associated with it'), MappingFlag(name='END', bits=4096, description='A flag indicating a variant has been mapped based on end coordinate'), MappingFlag(name='STRAND', bits=1024, description='A flag indicating a variant has been mapped based on strand'), MappingFlag(name='REF', bits=8192, description='A flag indicating a variant has been mapped based on reference allele'), MappingFlag(name='ALT', bits=16384, description='A flag indicating a variant has been mapped based on alternate(other) allele'), MappingFlag(name='IS_PALINDROMIC', bits=256, description='A flag indicating a variant is palindromic'), MappingFlag(name='NORMALISED', bits=8, description='A flag indicating an insertion/deletion variant has normalised prior tomatching'), MappingFlag(name='REF_FLIP', bits=64, description='A flag indicating the variant reference allele has been flipped prior tomatching'), MappingFlag(name='STRAND_FLIP', bits=128, description='A flag indicating a variant strand has been flipped prior to matching'), MappingFlag(name='PARTIAL_ALLELE_MATCH', bits=16, description='A flag indicating that not all alternate alleles have been matched for avariant'), MappingFlag(name='ALT_ALLELE_INFERRED', bits=32, description='A flag indicating the the alternate allele has been inferred for a variant'), MappingFlag(name='UNKNOWN_INDEL', bits=4, description='A flag indicating that a variant has been mapped from the unknowninsertion/deletion I/D/R specification')]#
The available mapping flags (list)
- gwas_norm.variants.constants.ID = MappingFlag(name='ID', bits=2, description='A flag indicating a variant has been mapped based on variantidentifier')#
A flag indicating a variant has been mapped based on variant identifier. This is not currently used but is implemented just in case it is used in future (MappingFlag)
- gwas_norm.variants.constants.INSERTION = 2#
Constant for an insertion polymorphism (int)
- gwas_norm.variants.constants.IS_PALINDROMIC = MappingFlag(name='IS_PALINDROMIC', bits=256, description='A flag indicating a variant is palindromic')#
A flag indicating a variant is palindromic, not currently used as we do not have any special palindromic handling (MappingFlag)
- class gwas_norm.variants.constants.MapCoord(chr_name, start_pos, strand, ref_allele, alt_allele)#
Bases:
tuple
- alt_allele#
Alias for field number 4
- chr_name#
Alias for field number 0
- ref_allele#
Alias for field number 3
- start_pos#
Alias for field number 1
- strand#
Alias for field number 2
- class gwas_norm.variants.constants.MappingFlag(name, bits, description)#
Bases:
tuple
- bits#
Alias for field number 1
- description#
Alias for field number 2
- name#
Alias for field number 0
- class gwas_norm.variants.constants.MappingResult(source_coords, mapping_coords, map_bits, source_row, map_row, errors, nsites, resolver)#
Bases:
tuple
- errors#
Alias for field number 5
- map_bits#
Alias for field number 2
- map_row#
Alias for field number 4
- mapping_coords#
Alias for field number 1
- nsites#
Alias for field number 6
- resolver#
Alias for field number 7
- source_coords#
Alias for field number 0
- source_row#
Alias for field number 3
- gwas_norm.variants.constants.NORMALISED = MappingFlag(name='NORMALISED', bits=8, description='A flag indicating an insertion/deletion variant has normalised prior tomatching')#
A flag indicating an insertion/deletion variant has been normalised prior to matching (MappingFlag)
- gwas_norm.variants.constants.NO_DATA = MappingFlag(name='NO_DATA', bits=0, description='A flag indicating a mapping has no data associated with it')#
A flag indicating a mapping has no data associated with it (MappingFlag)
- gwas_norm.variants.constants.PARTIAL_ALLELE_MATCH = MappingFlag(name='PARTIAL_ALLELE_MATCH', bits=16, description='A flag indicating that not all alternate alleles have been matched for avariant')#
A flag indicating that not all alternate alleles have been matched for a variant. Not currently used as partial allele matches map to NO_DATA (MappingFlag)
- gwas_norm.variants.constants.POP_START_IDX = 9#
The index position in the mapping VCF file where the population columns start (int)
- gwas_norm.variants.constants.REF = MappingFlag(name='REF', bits=8192, description='A flag indicating a variant has been mapped based on reference allele')#
A flag indicating a variant has been mapped based on reference allele (MappingFlag)
Note that this flag is applied after any reference allele or strand flipping during the matching process.
- gwas_norm.variants.constants.REF_FLIP = MappingFlag(name='REF_FLIP', bits=64, description='A flag indicating the variant reference allele has been flipped prior tomatching')#
A flag indicating the variant reference allele has been flipped prior to matching (MappingFlag)
- gwas_norm.variants.constants.RS_REGEX = re.compile('^rs[1-9]\\d*$')#
A pre-compiled regexp of constants.RS_STR a regular expression definition that looks for dbSNP SNP Ids (i.e. rsIDs) (re.pattern)
- gwas_norm.variants.constants.RS_STR = '^rs[1-9]\\d*$'#
A regular expression definition that looks for dbSNP SNP Ids (i.e. rsIDs) (str)
- gwas_norm.variants.constants.START = MappingFlag(name='START', bits=2048, description='A flag indicating a mapping has no data associated with it')#
A flag indicating a variant has been mapped based on start coordinate (MappingFlag)
- gwas_norm.variants.constants.STRAND = MappingFlag(name='STRAND', bits=1024, description='A flag indicating a variant has been mapped based on strand')#
A flag indicating a variant has been mapped based on strand (MappingFlag)
Note that this flag is applied after any strand flipping during the matching process.
- gwas_norm.variants.constants.STRAND_FLIP = MappingFlag(name='STRAND_FLIP', bits=128, description='A flag indicating a variant strand has been flipped prior to matching')#
A flag indicating a variant strand has been flipped prior to matching (MappingFlag)
- gwas_norm.variants.constants.UNKNOWN_INDEL = MappingFlag(name='UNKNOWN_INDEL', bits=4, description='A flag indicating that a variant has been mapped from the unknowninsertion/deletion I/D/R specification')#
A flag indicating that a variant has been mapped from the unknown insertion/deletion I/D/R specification (MappingFlag)
- gwas_norm.variants.constants.VCF_FORMAT_IDX = 8#
The index position of the format column in the mapping VCF file (int)
- gwas_norm.variants.constants.VCF_ID_IDX = 2#
The index position of the ID column in the mapping VCF file (int)
- gwas_norm.variants.constants.VCF_INFO_IDX = 7#
The index position of the info column in the mapping VCF file (int)
- gwas_norm.variants.constants.dataset_bits(name)#
Get the bits for a dataset name, this is really designed to be called from bash
- gwas_norm.variants.constants.decode_mapping_flags(flags)#
Decode the bitwise mapping flags into human readable strings.
- Parameters:
flags (int) – The bitwise flags to decode.
- Returns:
decoded_flags – The bitwise flags decoded into human readable strings.
- Return type:
list of str
- gwas_norm.variants.constants.main()#
Main entry point
gwas_norm.variants.common
#
Common functions for the variants subpackage
- exception gwas_norm.variants.common.SequenceError#
Bases:
BaseException
- gwas_norm.variants.common.get_end_pos(start_pos, ref_allele)#
Get the end position of a variant from its start position and reference allele
- gwas_norm.variants.common.get_uni_id(chr_name, start_pos, alleles)#
Generate a “universal ID”, these are not foolproof because of strand issues but will cover most cases.
- Parameters:
chr_name (str) – The string based chromosome name i.e. X is X not 23
start_pos (int) – The start position in bp
alleles (list of str) – The alleles for the variant
- Returns:
uni_id – The uni_id. This is: <chr_name>_<start_pos>_<upper_case_alleles_in_sort_order>
- Return type:
str
- gwas_norm.variants.common.skip_vcf_header(fobj)#
Skip the VCF header lines and return the first non header line and the header contents as a list. This is not a proper parser and does no error checking.
- Parameters:
fobj (file) – A file like object that responds to next()
- Returns:
line (str) – The first non-header line in the VCF (where the samples are defined)
header (list of str) – The VCF header
- gwas_norm.variants.common.split_alt(alt_str)#
Skip the VCF header lines and return the first non header line