Skip to content

Commit

Permalink
fix bugs in cojo cond
Browse files Browse the repository at this point in the history
  • Loading branch information
Jianhua-Wang committed Apr 17, 2024
1 parent a9995de commit 2a03e72
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 12 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
# Changelog

## [0.4.4] - 2024-04-17
## [0.4.6] - 2024-04-17

### Added

### Changed

### Fixed
- fix bugs in cojo cond

## [0.4.5] - 2024-04-17

### Added

Expand Down
2 changes: 2 additions & 0 deletions easyfinemap/easyfinemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,8 @@ def cond_sumstat(
cojo_input = ld.intersect(
all_sumstats, ldref, f"{temp_dir}/cojo_input_{chrom}", use_ref_EAF
)
self.logger.debug(f"Lead SNP: {lead_snp}")
self.logger.debug(f"Conditional SNPs: {cond_snps['SNPID'].tolist()}")
cond_res = ld.cojo_cond(
cojo_input, cond_snps, f"{temp_dir}/cojo_input_{chrom}", sample_size, use_ref_EAF
) # type: ignore
Expand Down
106 changes: 95 additions & 11 deletions easyfinemap/ldref.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def __init__(self):
self.tmp_root.mkdir(parents=True)

@io_in_tempdir(dir='./tmp/ldref')
def _clean_per_chr(self, inprefix: str, outprefix: str, mac: int = 10, temp_dir: Optional[str] = None) -> None:
def _clean_per_chr(
self, inprefix: str, outprefix: str, mac: int = 10, temp_dir: Optional[str] = None
) -> None:
"""
Clean the extracted LD reference per chromosome.
Expand Down Expand Up @@ -199,7 +201,9 @@ def valid(
raise FileNotFoundError(f"{inprefix}.bed not found.")
else:
# check if chrom is in the bim file
res = check_output(f'grep "^{chrom}[[:space:]]" {inprefix}.bim | head -n 1', shell=True)
res = check_output(
f'grep "^{chrom}[[:space:]]" {inprefix}.bim | head -n 1', shell=True
)
if len(res.decode()) == 0:
self.logger.warning(f"Chrom {chrom} not found in {inprefix}.bim")
continue
Expand Down Expand Up @@ -359,7 +363,9 @@ def intersect(
# raise RuntimeError(res.stderr)
freq = pd.read_csv(f"{temp_dir}/freq.frq", delim_whitespace=True)
freq['A2_frq'] = 1 - freq['MAF']
overlap_sumstat['EAF'] = freq['A2_frq'].where(freq['A2'] == overlap_sumstat['EA'], freq['MAF'])
overlap_sumstat['EAF'] = freq['A2_frq'].where(
freq['A2'] == overlap_sumstat['EA'], freq['MAF']
)
overlap_sumstat['MAF'] = freq['MAF']
return overlap_sumstat

Expand Down Expand Up @@ -455,7 +461,9 @@ def cojo_cond(
The updated summary statistics.
"""
if not use_ref_EAF and ColName.EAF not in sumstats.columns:
raise ValueError(f"{ColName.EAF} is not in the sumstats, please set use_ref_EAF to True")
raise ValueError(
f"{ColName.EAF} is not in the sumstats, please set use_ref_EAF to True"
)
chrom = sumstats[ColName.CHR].iloc[0]
# ld = LDRef()
# all_sumstats = pd.concat([sumstats, cond_snps], ignore_index=True)
Expand All @@ -466,7 +474,16 @@ def cojo_cond(
cojo_input = sumstats.copy()
cojo_input[ColName.N] = sample_size
cojo_input = cojo_input[
[ColName.SNPID, ColName.EA, ColName.NEA, ColName.EAF, ColName.BETA, ColName.SE, ColName.P, ColName.N]
[
ColName.SNPID,
ColName.EA,
ColName.NEA,
ColName.EAF,
ColName.BETA,
ColName.SE,
ColName.P,
ColName.N,
]
]
cojo_input.rename(
columns={
Expand Down Expand Up @@ -504,17 +521,82 @@ def cojo_cond(
self.logger.debug(f"conditional analysis: {' '.join(cmd)}")
res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
if res.returncode != 0:
self.logger.error(res.stderr)
raise RuntimeError(res.stderr)
else:
cond_res = pd.read_csv(f"{cojo_outfile}.cma.cojo", sep="\t", usecols=["SNP", "bC", "bC_se", "pC"])
except_error = 'Error: there is a collinearity problem of the given list of SNPs.'
if except_error in res.stdout and os.path.exists(
f"{temp_dir}/cojo_{chrom}.cond.given.cojo"
):
self.logger.warning(
'there is a collinearity problem of the given list of SNPs. Try slct again'
)
cojo_input[cojo_input['SNP'].isin(cond_snps[ColName.SNPID])].to_csv(
f"{temp_dir}/cojo_{chrom}.reslct.ma", sep=" ", index=False
)
cmd = [
self.gcta,
"--bfile",
ldref,
"--cojo-file",
f"{temp_dir}/cojo_{chrom}.reslct.ma",
"--diff-freq",
"1",
"--cojo-collinear",
"0.9",
"--cojo-slct",
"--out",
f"{temp_dir}/cojo_{chrom}.reslct",
]
self.logger.debug(f"slct again: {' '.join(cmd)}")
res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
if res.returncode != 0:
self.logger.error(res.stdout)
raise RuntimeError(res.stdout)
else:
new_conds = pd.read_csv(
f"{temp_dir}/cojo_{chrom}.reslct.jma.cojo", delim_whitespace=True
)
new_conds = new_conds[new_conds['pJ'] < 5e-8]['SNP'].values
with open(f"{temp_dir}/cojo_cond_{chrom}.snps", "w") as f:
f.write('\n'.join(new_conds))
cmd = [
self.gcta,
"--bfile",
ldref,
"--cojo-file",
cojo_p_file,
"--diff-freq",
"1",
"--cojo-collinear",
"0.99",
"--cojo-cond",
f"{temp_dir}/cojo_cond_{chrom}.snps",
"--out",
cojo_outfile,
]
self.logger.debug(f"conditional analysis: {' '.join(cmd)}")
res = run(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
if res.returncode != 0:
self.logger.error(res.stdout)
raise RuntimeError(res.stdout)
# self.logger.error(res.stdout)
# raise RuntimeError(res.stdout)
if os.path.exists(f"{cojo_outfile}.cma.cojo"):
cond_res = pd.read_csv(
f"{cojo_outfile}.cma.cojo", sep="\t", usecols=["SNP", "bC", "bC_se", "pC"]
)
cond_res.rename(
columns={"SNP": ColName.SNPID, "bC": ColName.COJO_BETA, "bC_se": ColName.COJO_SE, "pC": ColName.COJO_P},
columns={
"SNP": ColName.SNPID,
"bC": ColName.COJO_BETA,
"bC_se": ColName.COJO_SE,
"pC": ColName.COJO_P,
},
inplace=True,
)
output = sumstats.merge(cond_res, on=ColName.SNPID, how="left")
output = output.dropna(subset=[ColName.COJO_P, ColName.COJO_BETA, ColName.COJO_SE])
return output
else:
return sumstats

@io_in_tempdir('./tmp/ldref')
def annotate_r2(
Expand Down Expand Up @@ -547,7 +629,9 @@ def annotate_r2(
raise ValueError("Only one chromosome is allowed.")
chrom = sumstat[ColName.CHR].iloc[0]
if len(sumstat) > 100000:
self.logger.warning("The sumstats is large, it may take a long time to annotate the r2.")
self.logger.warning(
"The sumstats is large, it may take a long time to annotate the r2."
)
ld = LDRef()
r2_df = sumstat.copy()
r2_input = ld.intersect(sumstat, ldref.format(chrom=chrom), f"{temp_dir}/r2_input_{chrom}")
Expand Down

0 comments on commit 2a03e72

Please sign in to comment.