from pyspark.sql.types import * from pyspark.sql.functions import * from pyspark.sql.dataframe import * # Human genome annotations in GFF3 are available at https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/GCF_000001405.39_GRCh38.p13/ gff_path = "/databricks-datasets/genomics/gffs/GCF_000001405.39_GRCh38.p13_genomic.gff.bgz" vcf_path = "/databricks-datasets/genomics/1kg-vcfs/ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz"
annotations_df = spark.read \ .format('gff') \ .load(gff_path) \ .filter("seqid = 'NC_000022.11'") \ .alias('annotations_df') annotations_df.printSchema()
root
|-- seqId: string (nullable = true)
|-- source: string (nullable = true)
|-- type: string (nullable = true)
|-- start: long (nullable = true)
|-- end: long (nullable = true)
|-- score: double (nullable = true)
|-- strand: string (nullable = true)
|-- phase: integer (nullable = true)
|-- ID: string (nullable = true)
|-- Name: string (nullable = true)
|-- Parent: array (nullable = true)
| |-- element: string (containsNull = true)
|-- Target: string (nullable = true)
|-- Gap: string (nullable = true)
|-- Note: array (nullable = true)
| |-- element: string (containsNull = true)
|-- Dbxref: array (nullable = true)
| |-- element: string (containsNull = true)
|-- Is_circular: boolean (nullable = true)
|-- align_id: string (nullable = true)
|-- allele: string (nullable = true)
|-- anticodon: string (nullable = true)
|-- assembly_bases_aln: string (nullable = true)
|-- assembly_bases_seq: string (nullable = true)
|-- batch_id: string (nullable = true)
|-- bit_score: string (nullable = true)
|-- blast_aligner: string (nullable = true)
|-- blast_score: string (nullable = true)
|-- bound_moiety: string (nullable = true)
|-- chromosome: string (nullable = true)
|-- codons: string (nullable = true)
|-- common_component: string (nullable = true)
|-- consensus_splices: string (nullable = true)
|-- country: string (nullable = true)
|-- crc32: string (nullable = true)
|-- curated_alignment: string (nullable = true)
|-- description: string (nullable = true)
|-- direction: string (nullable = true)
|-- e_value: string (nullable = true)
|-- end_range: string (nullable = true)
|-- exception: string (nullable = true)
|-- exon_identity: string (nullable = true)
|-- exon_number: string (nullable = true)
|-- experiment: string (nullable = true)
|-- feat_class: string (nullable = true)
|-- filter_score: string (nullable = true)
|-- for_remapping: string (nullable = true)
|-- function: string (nullable = true)
|-- gap_count: string (nullable = true)
|-- gbkey: string (nullable = true)
|-- gene: string (nullable = true)
|-- gene_biotype: string (nullable = true)
|-- gene_synonym: string (nullable = true)
|-- genome: string (nullable = true)
|-- hsp_percent_coverage: string (nullable = true)
|-- identity: string (nullable = true)
|-- idty: string (nullable = true)
|-- inference: string (nullable = true)
|-- isolation-source: string (nullable = true)
|-- lxr_locAcc_currStat_120: string (nullable = true)
|-- lxr_locAcc_currStat_35: string (nullable = true)
|-- map: string (nullable = true)
|-- matchable_bases: string (nullable = true)
|-- matched_bases: string (nullable = true)
|-- matches: string (nullable = true)
|-- merge_aligner: string (nullable = true)
|-- mobile_element_type: string (nullable = true)
|-- model_evidence: string (nullable = true)
|-- mol_type: string (nullable = true)
|-- not_for_annotation: string (nullable = true)
|-- num_ident: string (nullable = true)
|-- num_mismatch: string (nullable = true)
|-- number: string (nullable = true)
|-- part: string (nullable = true)
|-- partial: string (nullable = true)
|-- pct_coverage: string (nullable = true)
|-- pct_coverage_hiqual: string (nullable = true)
|-- pct_identity_gap: string (nullable = true)
|-- pct_identity_gapopen_only: string (nullable = true)
|-- pct_identity_ungap: string (nullable = true)
|-- product: string (nullable = true)
|-- product_coverage: string (nullable = true)
|-- promoted_rank: string (nullable = true)
|-- protein_id: string (nullable = true)
|-- pseudo: string (nullable = true)
|-- qtaxid: string (nullable = true)
|-- rank: string (nullable = true)
|-- recombination_class: string (nullable = true)
|-- regulatory_class: string (nullable = true)
|-- rpt_family: string (nullable = true)
|-- rpt_type: string (nullable = true)
|-- rpt_unit_range: string (nullable = true)
|-- rpt_unit_seq: string (nullable = true)
|-- satellite: string (nullable = true)
|-- splices: string (nullable = true)
|-- standard_name: string (nullable = true)
|-- start_range: string (nullable = true)
|-- tag: string (nullable = true)
|-- tissue-type: string (nullable = true)
|-- transcript_id: string (nullable = true)
|-- transl_except: string (nullable = true)
|-- transl_table: string (nullable = true)
|-- weighted_identity: string (nullable = true)
from pyspark.sql.functions import * parent_child_df = annotations_df \ .join( annotations_df.select('id', 'type', 'name', 'start', 'end').alias('parent_df'), col('annotations_df.parent')[0] == col('parent_df.id') # each row in annotation_df has at most one parent ) \ .orderBy('annotations_df.start', 'annotations_df.end') \ .select( 'annotations_df.seqid', 'annotations_df.type', 'annotations_df.start', 'annotations_df.end', 'annotations_df.id', 'annotations_df.name', col('annotations_df.parent')[0].alias('parent_id'), col('parent_df.Name').alias('parent_name'), col('parent_df.type').alias('parent_type'), col('parent_df.start').alias('parent_start'), col('parent_df.end').alias('parent_end') ) \ .alias('parent_child_df') display(parent_child_df)
Integrating Variants with Annotations using Glow
Glow addeds support for loading GFF3 files, which are commonly used to store annotations on genomic regions. When combined with variant data, genomic annotations provide context for each change in the genome. Does this mutation cause a change in the protein coding sequence of gene? If so, how does it change the protein? Or is the mutation in a low information part of the genome, also known as “junk DNA”. And everything in between. In this notebook, we demonstrate how to use Glow's APIs to work with annotations from the RefSeq database alongside genomic variants from the 1000 Genomes project.
To start, we will download annotations for the GRCh38 human reference genome from RefSeq and define the paths that we will load our data from.
Last refresh: Never