Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions experiments/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The experiments directory is for rough experimental code. See the [documentation](../docs/Getting_Started/e_experiments_directory.md).
40 changes: 40 additions & 0 deletions experiments/tralfamadorian97/explore_ank_spond.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import polars as pl
from pathlib import Path

from mecfs_bio.constants.gwaslab_constants import GWASLAB_CHROM_COL, GWASLAB_POS_COL, GWASLAB_EFFECT_ALLELE_COL, \
GWASLAB_NON_EFFECT_ALLELE_COL

# Harmonized FinnGen ankylosing spondylitis summary statistics, dumped to parquet.
# NOTE(review): "finngne" looks like a typo for "finngen", but the raw FinnGen
# download task also declares project="finngne" -- confirm how the asset store
# derives directories before renaming, or this path will stop resolving.
finn_path = Path(
    "assets/base_asset_store/gwas/ankylosing_spondylitis/finngne/processed/"
    "finngen_ank_spond_harmonized_dump_to_parquet.parquet"
)

# Harmonized Million Veterans (EUR) ankylosing spondylitis summary statistics,
# dumped to parquet.
mv_path = Path(
    "assets/base_asset_store/gwas/ankylosing_spondylitis/million_veterans/processed/"
    "million_veterans_eur_ank_spond_harmonized_dump_to_parquet.parquet"
)
def _count_duplicate_variants(df):
    """
    Count the rows sharing each variant key.

    The key is the combination of the GWASLab chromosome, position,
    effect-allele and non-effect-allele columns.

    Returns a dataframe with one row per key plus a "len" column holding the
    number of rows with that key, sorted with the most duplicated keys first.
    """
    variant_key = [
        GWASLAB_CHROM_COL,
        GWASLAB_POS_COL,
        GWASLAB_EFFECT_ALLELE_COL,
        GWASLAB_NON_EFFECT_ALLELE_COL,
    ]
    return df.group_by(variant_key).len().sort(by="len", descending=True)


def go():
    """Load the FinnGen parquet dump and pause to inspect duplicated variants."""
    df = pl.read_parquet(finn_path)
    counts = _count_duplicate_variants(df)  # noqa: F841 - inspected in the debugger
    # Drop into the debugger so `df` and `counts` can be explored interactively.
    # breakpoint() honours PYTHONBREAKPOINT, so the pause can be disabled or
    # redirected to another debugger without editing this script.
    breakpoint()
    print("yo")


def go_mv():
    """Load the Million Veterans parquet dump and pause to inspect duplicated variants."""
    df = pl.read_parquet(mv_path)
    counts = _count_duplicate_variants(df)  # noqa: F841 - inspected in the debugger
    # Same interactive pause as in go().
    breakpoint()
    print("yo")


if __name__ == "__main__":
    go_mv()


25 changes: 25 additions & 0 deletions experiments/tralfamadorian97/get_min_pvalue_ankspond.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import polars as pl
def go():
    """
    Print the smallest value of the `p_value` column in the UK Biobank
    ankylosing spondylitis summary-statistics file.

    The gzipped TSV is queried in place through DuckDB rather than being
    loaded into a dataframe first.
    """
    # Imported here, as in the original script, so importing this module does
    # not itself require duckdb.
    import duckdb

    # Path to the gzipped TSV file, relative to the working directory.
    file_path = 'assets/base_asset_store/gwas/ankylosing_spondylitis/uk_biobank/raw/GCST90474065.h.tsv.gz'

    # DuckDB natively handles gzip decompression and detects headers/delimiters.
    # Note: this is a non-raw f-string, so '\t' reaches DuckDB as a literal tab.
    query = f"""
        SELECT MIN(p_value) AS min_p
        FROM read_csv_auto('{file_path}', delim='\t', header=True)
    """

    # Use the in-memory connection as a context manager so it is closed even
    # if the query raises; the fetched row is a plain tuple and outlives it.
    with duckdb.connect() as con:
        result = con.execute(query).fetchone()

    min_p_value = result[0]
    print(f"The minimum p-value is: {min_p_value}")


if __name__ == "__main__":
    go()
67 changes: 67 additions & 0 deletions experiments/tralfamadorian97/runs/as_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from mecfs_bio.analysis.runner.default_runner import DEFAULT_RUNNER
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.analysis.finngen_ank_spond_manhattan import \
FINGEN_ANK_SPOND_MANHATTAN
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.processed.finngen_ank_spond_sumstats import \
FINGNEN_ANK_SPOND_SUMSTATS
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.processed.finngen_ank_spond_sumstats_harmonized import \
FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.raw.raw_finngen_ank_spond_data import \
FINNGEN_ANKYLOSING_SPONDYLITIS_DATA_RAW
from mecfs_bio.assets.gwas.ankylosing_spondylitis.fixed_effects_meta_analysis.processed.ank_spond_meta_analysis import \
ANK_SPOND_FIXED_EFFECTS_META_ANALYSIS
from mecfs_bio.assets.gwas.ankylosing_spondylitis.million_veterans.analysis.mv_ankspond_manhattan import \
MV_ANK_SPOND_MANHATTAN
from mecfs_bio.assets.gwas.ankylosing_spondylitis.million_veterans.processed.mv_eur_ank_spond_sumstats import \
MILLION_VETERAN_ANKYLOSING_SPONDYLITIS_SUMSTATS
from mecfs_bio.assets.gwas.ankylosing_spondylitis.million_veterans.processed.mv_eur_ank_spond_sumstats_dump_to_parquet import \
MILLION_VETERANS_ANK_SPOND_SUMSTATS_37_DUMP_TO_PARQUET
from mecfs_bio.assets.gwas.ankylosing_spondylitis.million_veterans.processed.mv_eur_ank_spond_sumstats_harmonized import \
MILLION_VETERANS_ANK_SPOND_SUMSTATS_HARMONIZED
from mecfs_bio.assets.gwas.ankylosing_spondylitis.million_veterans.raw.raw_mv_eur_ank_spond_data import \
MILLION_VETERAN_ANKYLOSING_SPONDYLITIS_EUR_DATA_RAW
from mecfs_bio.assets.gwas.ankylosing_spondylitis.ukbb.processed.ukbb_ank_spond_sumstats import \
UK_BIOBANK_ANKYLOSING_SPONDYLITIS_SUMSTATS
from mecfs_bio.assets.gwas.ankylosing_spondylitis.ukbb.processed.ukbb_ank_spond_sumstats_harmonized import \
UKBB_ANK_SPOND_SUMSTATS_HARMONIZED
from mecfs_bio.assets.gwas.ankylosing_spondylitis.ukbb.processed.ukbb_eur_ank_spond_filtered import \
FILTERED_UKBB_ANK_SPOND
from mecfs_bio.assets.gwas.ankylosing_spondylitis.ukbb.processed.ukbb_eur_ank_spond_parquet import \
UKBB_ANK_SPOND_PARQUET
from mecfs_bio.assets.gwas.ankylosing_spondylitis.ukbb.raw.raw_ukbb_eur_ank_spond_data import \
UK_BIOBANK_ANKYLOSING_SPONDYLITIS_EUR_DATA_RAW



def go():
    """
    Hand the ankylosing spondylitis fixed-effects meta-analysis asset to the
    default runner, saving incrementally and forcing no transitive rebuilds.
    """
    # Other assets previously run from this script; swap one into `targets`
    # to run it instead:
    #   MILLION_VETERAN_ANKYLOSING_SPONDYLITIS_SUMSTATS
    #   FINGNEN_ANK_SPOND_SUMSTATS
    #   MILLION_VETERAN_ANKYLOSING_SPONDYLITIS_EUR_DATA_RAW
    #   FINNGEN_ANKYLOSING_SPONDYLITIS_DATA_RAW
    #   UK_BIOBANK_ANKYLOSING_SPONDYLITIS_EUR_DATA_RAW
    #   FILTERED_UKBB_ANK_SPOND
    #   UKBB_ANK_SPOND_PARQUET
    #   MILLION_VETERANS_ANK_SPOND_SUMSTATS_37_DUMP_TO_PARQUET
    #   UK_BIOBANK_ANKYLOSING_SPONDYLITIS_SUMSTATS
    #   FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED
    #   MILLION_VETERANS_ANK_SPOND_SUMSTATS_HARMONIZED
    #   UKBB_ANK_SPOND_SUMSTATS_HARMONIZED
    #   FINGEN_ANK_SPOND_MANHATTAN
    #   MV_ANK_SPOND_MANHATTAN
    targets = [ANK_SPOND_FIXED_EFFECTS_META_ANALYSIS]
    DEFAULT_RUNNER.run(
        targets,
        incremental_save=True,
        must_rebuild_transitive=[],
    )


if __name__ == '__main__':
    go()
19 changes: 19 additions & 0 deletions experiments/tralfamadorian97/runs/lupus_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from mecfs_bio.analysis.runner.default_runner import DEFAULT_RUNNER
from mecfs_bio.assets.gwas.systemic_lupus_erythematosus.bentham_et_al_2015.analysis_results.bentham_2015_standard_analysis import (
BENTHAM_LUPUS_STANDARD_ANALYSIS,
)


def lupus_analysis():
    """
    Run the terminal MAGMA tasks of the standard analysis for the
    Bentham et al. lupus GWAS.
    """
    # Only the terminal tasks are passed to the runner.
    terminal_tasks = BENTHAM_LUPUS_STANDARD_ANALYSIS.magma_tasks.inner.terminal_tasks()
    DEFAULT_RUNNER.run(
        terminal_tasks,
        incremental_save=True,
        must_rebuild_transitive=[],
    )


if __name__ == "__main__":
    lupus_analysis()
10 changes: 10 additions & 0 deletions experiments/tralfamadorian97/try_ieugwas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import ieugwaspy
def go(dataset_id: str = "ukb-b-18194") -> None:
    """
    Print what `ieugwaspy.gwasinfo_files` returns for one dataset id, then
    pause in the debugger.

    Args:
        dataset_id: Identifier passed to `ieugwaspy.gwasinfo_files`. Defaults
            to "ukb-b-18194"; "finn-b-M13_ANKYLOSPON" is another id that has
            been tried from this script.
    """
    result = ieugwaspy.gwasinfo_files(dataset_id)
    print(result)
    # Pause so `result` can be explored interactively. breakpoint() honours
    # PYTHONBREAKPOINT, so the pause can be disabled without editing the script.
    breakpoint()
    print("testing")


if __name__ == '__main__':
    go()
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.processed.finngen_ank_spond_sumstats_harmonized import \
FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED
from mecfs_bio.build_system.task.gwaslab.gwaslab_manhattan_and_qq_plot_task import GWASLabManhattanAndQQPlotTask

# Plot task fed by the harmonized FinnGen ankylosing spondylitis sumstats.
# NOTE(review): plot_setting="m" presumably selects the Manhattan plot only --
# confirm against GWASLabManhattanAndQQPlotTask.
# NOTE(review): this name is spelled FINGEN while the sumstats tasks use FINGNEN
# and the raw task uses FINNGEN; importers depend on the current spellings.
FINGEN_ANK_SPOND_MANHATTAN = GWASLabManhattanAndQQPlotTask.create(
    FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED,
    asset_id="fingen_ank_spond_manhattan",
    plot_setting="m",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.processed.finngen_ank_spond_sumstats_harmonized import (
FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED,
)
from mecfs_bio.build_system.task.gwaslab.gwaslab_sumstats_to_table_task import (
GwasLabSumstatsToTableTask,
)

# Table dump of the harmonized FinnGen ankylosing spondylitis sumstats, stored
# under the "processed" sub-directory.
FINNGEN_ANK_SPOND_HARMONIZED_DUMP_TO_PARQUET = GwasLabSumstatsToTableTask.create_from_source_task(
    source_tsk=FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED,
    asset_id="finngen_ank_spond_harmonized_dump_to_parquet",
    sub_dir="processed",
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.raw.raw_finngen_ank_spond_data import (
FINNGEN_ANKYLOSING_SPONDYLITIS_DATA_RAW,
)
from mecfs_bio.build_system.meta.asset_id import AssetId
from mecfs_bio.build_system.task.gwaslab.gwaslab_create_sumstats_task import (
GWASLabColumnSpecifiers,
GWASLabCreateSumstatsTask,
)
from mecfs_bio.build_system.task.pipes.composite_pipe import CompositePipe
from mecfs_bio.build_system.task.pipes.str_split_exact_col import SplitExactColPipe

# GWASLab sumstats task built on the raw FinnGen ankylosing spondylitis download.
FINGNEN_ANK_SPOND_SUMSTATS = GWASLabCreateSumstatsTask(
    df_source_task=FINNGEN_ANKYLOSING_SPONDYLITIS_DATA_RAW,
    asset_id=AssetId("finngen_ank_spond_sumstats"),
    basic_check=True,
    genome_build="infer",
    liftover_to="19",
    # Maps GWASLab fields onto the raw column names plus the four columns
    # produced by the pre-pipe below (ES, SE, LP, AF).
    fmt=GWASLabColumnSpecifiers(
        rsid="ID",
        chrom="CHROM",
        pos="POS",
        nea="REF",
        ea="ALT",
        beta="ES",
        se="SE",
        mlog10p="LP",
        eaf="AF",
    ),
    # The per-study column packs its values into one ":"-separated string;
    # split it into named columns before GWASLab reads the frame.
    # NOTE(review): presumably ordered to match the VCF FORMAT field -- confirm.
    pre_pipe=CompositePipe(
        [
            SplitExactColPipe(
                col_to_split="finn-b-M13_ANKYLOSPON",
                split_by=":",
                new_col_names=("ES", "SE", "LP", "AF"),
            ),
        ]
    ),
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from mecfs_bio.assets.gwas.ankylosing_spondylitis.finngen.processed.finngen_ank_spond_sumstats import (
FINGNEN_ANK_SPOND_SUMSTATS,
)
from mecfs_bio.build_system.task.gwaslab.gwaslab_create_sumstats_task import (
GwasLabTransformSpec,
GWASLabVCFRef,
HarmonizationOptions,
)
from mecfs_bio.build_system.task.gwaslab.gwaslab_transform_sumstats import (
GWASLabTransformSumstatsTask,
)

# Harmonized version of the FinnGen ankylosing spondylitis sumstats, using the
# hg19 reference sequence and the 1kg EUR hg19 VCF named below.
FINGNEN_ANK_SPOND_SUMSTATS_HARMONIZED = GWASLabTransformSumstatsTask.create_from_source_task(
    source_tsk=FINGNEN_ANK_SPOND_SUMSTATS,
    asset_id="finngen_ank_spond_sumstats_37_harmonized",
    spec=GwasLabTransformSpec(
        harmonize_options=HarmonizationOptions(
            ref_infer=GWASLabVCFRef(name="1kg_eur_hg19", ref_alt_freq="AF"),
            ref_seq="ucsc_genome_hg19",
            check_ref_files=True,
            # Both drop flags are enabled.
            # NOTE(review): presumably these discard variants that cannot be
            # matched to, or resolved against, the references -- confirm.
            drop_missing_from_ref_seq=True,
            drop_missing_from_ref_infer_or_ambiguous=True,
            cores=1,
        ),
    ),
)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
Summary statistics for ankylosing spondylitis from FinnGen


See: https://opengwas.io/datasets/finn-b-M13_ANKYLOSPON#

OpenGWAS only provides a temporary download link, so the file is mirrored on Dropbox.


Note: This dataset appears to contain some anomalies, probably due to the conversion of multi-allelic to bi-allelic genetic variants
- e.g. There are duplicated genetic variants with different effect sizes
"""

from pathlib import PurePath

import polars as pl

from mecfs_bio.build_system.meta.asset_id import AssetId
from mecfs_bio.build_system.meta.gwas_summary_file_meta import GWASSummaryDataFileMeta
from mecfs_bio.build_system.meta.read_spec.dataframe_read_spec import (
DataFrameReadSpec,
DataFrameTextFormat,
)
from mecfs_bio.build_system.task.download_file_task import DownloadFileTask

# Download task for the FinnGen ankylosing spondylitis VCF mirrored on Dropbox.
FINNGEN_ANKYLOSING_SPONDYLITIS_DATA_RAW = DownloadFileTask(
    meta=GWASSummaryDataFileMeta(
        id=AssetId("finngen_spond_eur_raw"),
        trait="ankylosing_spondylitis",
        # NOTE(review): "finngne" looks like a typo for "finngen". It presumably
        # feeds the on-disk asset path, so renaming it needs a coordinated change.
        project="finngne",
        sub_dir="raw",
        project_path=PurePath("finn-b-M13_ANKYLOSPON.vcf.gz"),
        read_spec=DataFrameReadSpec(
            format=DataFrameTextFormat(
                separator="\t",
                # The file is read without a header row, so every column is
                # named explicitly here; the last one is the per-study column.
                has_header=False,
                column_names=[
                    "CHROM",
                    "POS",
                    "ID",
                    "REF",
                    "ALT",
                    "QUAL",
                    "FILTER",
                    "INFO",
                    "FORMAT",
                    "finn-b-M13_ANKYLOSPON",
                ],
                # NOTE(review): presumably skips the "#"-prefixed VCF header lines.
                comment_char="#",
                # Force CHROM to be read as a string rather than an inferred type.
                schema_overrides={"CHROM": pl.String()},
            ),
        ),
    ),
    url="https://www.dropbox.com/scl/fi/keanskgpwrf26ngdly3dm/finn-b-M13_ANKYLOSPON.vcf.gz?rlkey=gwzualqvpiyktd965yhpdpztn&dl=1",
    # No checksum is supplied for this download.
    md5_hash=None,
)
Loading
Loading