Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: proper simulations #47

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 12 additions & 14 deletions sim/combine_random_sim_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,27 +99,25 @@ def main():
parse_bounds(boundsfiles).to_csv(args.out + '-bounds.csv', index=False)

spansfiles = glob.glob(args.str_dir+"/*-spanning.txt")
if len(spansfiles) == 0:
sys.exit('ERROR: No -spanning.txt files found in the given directory: ' + args.str_dir)
parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)
if len(spansfiles) >= 0:
parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)

unplacedfiles = glob.glob(args.str_dir+"/*-unplaced.txt")
if len(unplacedfiles) == 0:
sys.exit('ERROR: No -unplaced.txt files found in the given directory: ' + args.str_dir)
parse_unplaced(unplacedfiles).to_csv(args.out + '-unplaced.csv', index=False)

readsfiles = glob.glob(args.str_dir+"/*-reads.txt")
if len(readsfiles) == 0:
sys.exit('ERROR: No -reads.txt files found in the given directory: ' + args.str_dir)
all_df_reads = []
for f in readsfiles:
df_reads = pd.read_csv(f, sep='\t')
df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
df_reads['sim'] = get_sim_str(f)
all_df_reads.append(df_reads)
all_df = pd.concat(all_df_reads)
all_df = all_df.merge(bed_df, how='left', on='sim')
all_df.to_csv(args.out + '-results.csv', index=False)
if len(readsfiles) >= 0:
all_df_reads = []
for f in readsfiles:
df_reads = pd.read_csv(f, sep='\t')
df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
df_reads['sim'] = get_sim_str(f)
all_df_reads.append(df_reads)
all_df = pd.concat(all_df_reads)
all_df = all_df.merge(bed_df, how='left', on='sim')
all_df.to_csv(args.out + '-results.csv', index=False)

if __name__ == "__main__":
main()
3 changes: 1 addition & 2 deletions sim/sim_shared.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,7 @@ str_call = {
def sample = branch.name.prefix

from (sample + ".str.bin", sample + ".bam", "strling-bounds.txt") produce(
sample + "-reads.txt", sample + "-bounds.txt",
sample + "-spanning.txt", sample + "-unplaced.txt",
sample + "-bounds.txt", sample + "-unplaced.txt",
sample + "-genotype.txt") {

def bamname = get_fname(input.bam)
Expand Down
12 changes: 6 additions & 6 deletions sim/simulate_random.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ mutate_locus = {
produce('*.bed') {
exec """
$PYTHON $TOOLS/random_str_alleles.py
--locus "4 3076604 3076695 CAG"
--locus "7 117143769 117143769 CAG"
--out $dir/
--num 100
--min -5
--max 200
--num 150
--min 0
--max 600
--fixed 0
--seed 7
"""
Expand All @@ -36,15 +36,15 @@ simulate_str_reads = {
~/storage/git/STRling/src/strpkg/simulate_reads
--fasta $REF
--output $output.prefix
$input.hist
$input.cram
$input.bed
"""
}

combine = {

exec """
$PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out HTT
$PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out sim
"""
}

Expand Down
142 changes: 71 additions & 71 deletions src/strpkg/call.nim
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,24 @@ import ./unpack
export tread
export Soft

const n_sims = 100
const each_n = 2
proc string_hash(s:string, sim_i:int): uint64 {.inline.} =
  ## djb2 hash of `s` followed by the decimal digits of `sim_i`.
  ## Used to derive a per-simulation pseudo-random value from a read name.
  ## NOTE(review): the two parts are plainly concatenated, so e.g.
  ## ("a1", 2) and ("a", 12) hash identically — confirm this is acceptable.
  result = 5381'u64
  for part in [s, $sim_i]:
    for ch in part:
      # h*33 + c: identical to the classic ((h shl 5) + h) + c under
      # uint64 wrap-around arithmetic.
      result = result * 33'u64 + ch.uint64

proc filter_sims[T:tread|Support](trs: seq[T], sim_i: int, each_n:int): seq[T] =
  ## Deterministically down-samples `trs` for simulation replicate `sim_i`:
  ## returns a uniformly random subset of ~1/`each_n` of the elements
  ## (always at least one when `trs` is non-empty).
  ## The permutation depends only on `sim_i`, so repeated calls with the
  ## same arguments yield the same subset.
  result = trs
  if result.len == 0: return
  # Use a local RNG state seeded per-replicate instead of randomize(),
  # which clobbered the global random state as a side effect.
  # (std/random's randomize(seed) is defined as `state = initRand(seed)`,
  # so the permutation produced here is unchanged.)
  var rng = initRand(sim_i + 1)
  rng.shuffle(result)
  # `div` replaces int(len/each_n): integer division directly, without a
  # round-trip through float.
  result.setLen(max(1, result.len div each_n))

proc call_main*() =
var p = newParser("strling call"):
option("-f", "--fasta", help="path to fasta file")
Expand Down Expand Up @@ -143,6 +161,8 @@ proc call_main*() =
# Parse -bounds.txt file of regions. These will also be genotyped
bounds = parse_bounds(args.bounds, opts.targets)
stderr.write_line &"Read {len(bounds)} bounds from {args.bounds}"
else:
quit "-b bounds file is required for this branch"

# Merge loci and bounds, with loci overwriting bounds
for bound in bounds.mitems:
Expand All @@ -159,96 +179,76 @@ proc call_main*() =
for locus in loci:
bounds.add(locus)

var unplaced_counts = initCountTable[string]()
var genotypes_by_repeat = initTable[string, seq[Call]]()

# Assign STR reads to provided bounds and genotype them
for bound in bounds.mitems:
var str_reads = assign_reads_locus(bound, treads_by_tid_rep)
if bound.right - bound.left > 1000'u32:
stderr.write_line "large bounds:" & $bound & " skipping"
continue
var (spans, median_depth) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq)
if spans.len > 5_000:
when defined(debug):
stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping."
continue
if median_depth == -1:
continue

var gt = genotype(bound, str_reads, spans, opts, float(median_depth))

var canon_repeat = bound.repeat.canonical_repeat
if not genotypes_by_repeat.hasKey(canon_repeat):
genotypes_by_repeat[canon_repeat] = @[]
genotypes_by_repeat[canon_repeat].add(gt)

#var estimate = spans.estimate_size(frag_dist)
bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth
when defined(debug):
for s in spans:
span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name)
var locusid = bound.id(opts.targets)
for r in str_reads:
reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid

# Cluster remaining reads and genotype
var ci = 0

for key, treads in treads_by_tid_rep.mpairs:
## treads are for the single tid, repeat unit defined by key
for c in treads.cluster(max_dist=opts.window.uint32, min_supporting_reads=opts.min_support):
if c.reads[0].tid == -1:
unplaced_counts[c.reads[0].repeat.tostring] = c.reads.len
continue
t0 = cpuTime()

var spansByBound = initTable[Bounds, seq[Support]]()
var treadsByBound = initTable[Bounds, seq[tread]]()
var medByBound = initTable[Bounds, int]()
for sim_i in 0..<n_sims:
echo "sim:", sim_i, " time:", cpuTime() - t0
t0 = cpuTime()
var unplaced_counts = initCountTable[string]()
var genotypes_by_repeat = initTable[string, seq[Call]]()

var b: Bounds
var good_cluster: bool
(b, good_cluster) = bounds(c, min_clip, min_clip_total)
if good_cluster == false:
for bound in bounds.mitems:

if bound.right - bound.left > 1000'u32:
stderr.write_line "large bounds:" & $bound & " skipping"
continue

var (spans, median_depth) = ibam.spanners(b, opts.window, frag_dist, opts.min_mapq)
if bound notin treadsByBound:
treadsByBound[bound] = assign_reads_locus(bound, treads_by_tid_rep)
var str_reads = treadsByBound[bound].filter_sims(sim_i, each_n)

if bound notin spansByBound:
var (s, md) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq)
spansByBound[bound] = s
medByBound[bound] = md

var spans = spansByBound[bound].filter_sims(sim_i, each_n)
var median_depth = medByBound[bound]

if spans.len > 5_000:
when defined(debug):
stderr.write_line &"High depth for bound {opts.targets[b.tid].name}:{b.left}-{b.right} got {spans.len} pairs. Skipping."
stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping."
continue
if median_depth == -1:
continue

var gt = genotype(b, c.reads, spans, opts, float(median_depth))
var gt = genotype(bound, str_reads, spans, opts, float(median_depth))

var canon_repeat = b.repeat.canonical_repeat
var canon_repeat = bound.repeat.canonical_repeat
if not genotypes_by_repeat.hasKey(canon_repeat):
genotypes_by_repeat[canon_repeat] = @[]
genotypes_by_repeat[canon_repeat].add(gt)

#var estimate = spans.estimate_size(frag_dist)
bounds_fh.write_line b.tostring(opts.targets) & "\t" & $median_depth

bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth
when defined(debug):
for s in spans:
span_fh.write_line s.tostring(b, opts.targets[b.tid].name)
for s in c.reads:
reads_fh.write_line s.tostring(opts.targets) & "\t" & $ci
ci += 1

# Loop through again and refine genotypes for loci that are the only
# large expansion with that repeat unit
for repeat, genotypes in genotypes_by_repeat:
var gt_expanded: seq[Call]
for gt in genotypes:
if gt.is_large:
gt_expanded.add(gt)
if gt_expanded.len > 1:
break
if gt_expanded.len == 1:
gt_expanded[0].update_genotype(unplaced_counts[repeat])
for gt in genotypes:
gt_fh.write_line gt.tostring()

for repeat, count in unplaced_counts:
unplaced_fh.write_line &"{repeat}\t{count}"
span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name)
var locusid = bound.id(opts.targets)
for r in str_reads:
reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid

# Loop through again and refine genotypes for loci that are the only
# large expansion with that repeat unit
for repeat, genotypes in genotypes_by_repeat:
var gt_expanded: seq[Call]
for gt in genotypes:
if gt.is_large:
gt_expanded.add(gt)
if gt_expanded.len > 1:
break
if gt_expanded.len == 1:
gt_expanded[0].update_genotype(unplaced_counts[repeat])
for gt in genotypes:
gt_fh.write_line gt.tostring()

for repeat, count in unplaced_counts:
unplaced_fh.write_line &"{repeat}\t{count}"

gt_fh.close
bounds_fh.close
Expand Down
10 changes: 9 additions & 1 deletion src/strpkg/cluster.nim
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ type Bounds* = object
name*: string
force_report*: bool

import hashes
# Hash for Bounds so it can be used as a key in Table/HashSet caches
# (call.nim keys spanners/reads/depth tables by Bounds).
# `!&` folds each value into the running hash; `!$` finishes it.
proc hash*(b:Bounds): Hash =
  var h:Hash = 0
  h = h !& b.tid
  h = h !& b.left.int32
  h = h !& b.right.int32
  # NOTE(review): repeat/name/force_report are not mixed in — this is only
  # correct if `==` for Bounds also treats tid/left/right as the identity
  # (or the extra collisions are acceptable); confirm against the `==`
  # proc defined below, which is not fully visible here.
  result = !$h

const bounds_header* = "#chrom\tleft\tright\trepeat\tname\tleft_most\tright_most\tcenter_mass\tn_left\tn_right\tn_total"

proc `==`(a,b: Bounds): bool =
Expand Down Expand Up @@ -135,7 +143,7 @@ proc parse_bed*(f:string, targets: seq[Target], window: uint32): seq[Bounds] =
# Parse single line of a STRling bounds file
proc parse_boundsline*(l:string, targets: seq[Target]): Bounds =
var l_split = l.split("\t")
if len(l_split) != 11:
if len(l_split) notin [11, 12]:
quit fmt"Error reading loci bed file. Expected 11 fields and got {len(l_split)} on line: {l}"
result.tid = int32(get_tid(l_split[0], targets))
result.left = uint32(parseInt(l_split[1]))
Expand Down