Skip to content

Commit 8669436

Browse files
committed
Add support for annotation files
1 parent fc7806a commit 8669436

7 files changed

Lines changed: 335 additions & 21 deletions

File tree

haploy.py

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from bs4 import BeautifulSoup
77
import urllib.request
88
import json
9+
import glob
910

1011
def print_uptree(snpset, ut, do_print=True, b3x='b37'):
12+
prev_gl=[]
1113
rep=''
1214
y=snpset['Y'];
1315
pos=0
@@ -17,11 +19,33 @@ def print_uptree(snpset, ut, do_print=True, b3x='b37'):
1719
txt=''
1820
if 'txt' in mut:
1921
txt=mut['txt']
22+
otherg=mut['isog']
23+
if mut['ftg'] != '?':
24+
if mut['isog']:
25+
otherg+=', '
26+
otherg+=mut['ftg']
2027
if mut[b3x] in y:
21-
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], mut['isog'], txt)
28+
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['raw'], otherg, txt)
2229
else:
23-
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], mut['isog'], txt)
30+
rep += "%-1s%-11s%s %-30s %-32s %s\n"%(mut['tag'], mut['g'], ' ', mut['raw'], otherg, txt)
2431
pass
32+
33+
if not mut['g'] in prev_gl:
34+
if mut['g'] in annotations_by_g:
35+
for anno in annotations_by_g[mut['g']]:
36+
rep += "%45s[Y] %s\n"%('',anno['txt'])
37+
prev_gl.append(mut['g'])
38+
if not mut['ftg'] in prev_gl:
39+
if mut['ftg'] in annotations_by_g and not mut['g'] in annotations_by_g:
40+
for anno in annotations_by_g[mut['ftg']]:
41+
rep += "%45s[F] %s\n"%('',anno['txt'])
42+
if mut['ftg'] != '?':
43+
prev_gl.append(mut['ftg'])
44+
for m in mut['raw'].split('/'):
45+
m2=m.replace('(H)','')
46+
if m2 in annotations_by_m:
47+
for anno in annotations_by_m[m2]:
48+
rep += "%45s[M] %s\n"%('',anno['txt'])
2549
if do_print:
2650
print(rep)
2751
return rep
@@ -522,7 +546,6 @@ def load_yfull_snp(pages):
522546

523547
haplo_ybrowse_info = ''
524548
haplo_ybrowse_muts_by_name = {}
525-
haplo_ybrowse_muts_by_b38 = {}
526549

527550
# Source: http://www.ybrowse.org/gbrowse2/gff/
528551
def load_ybrowse_snp():
@@ -568,8 +591,6 @@ def load_ybrowse_snp():
568591
}
569592
if mname not in haplo_ybrowse_muts_by_name:
570593
haplo_ybrowse_muts_by_name[mname] = mut
571-
if b38 not in haplo_ybrowse_muts_by_b38:
572-
haplo_ybrowse_muts_by_b38[b38] = mut
573594
print("Lines in YBrowse snp DB: ", len(haplo_ybrowse_muts_by_name))
574595

575596
# Convert formats with CrossMap and chain file in crossmap/
@@ -777,13 +798,8 @@ def decode_entry(e):
777798
else:
778799
if mut['t'] != m['t']:
779800
print('FTDNA der mismatch:', e, mut['t'], m['t'], mut, m)
780-
#if not 'isog' in m:
781-
# m['isog']=''
782-
#else:
783-
if 'isog' in m:
784-
if m['isog']:
785-
m['isog']+=', '
786-
m['isog']+=mut['g']+''
801+
if 'b38' in m:
802+
m['ftg']=mut['g']
787803
return m
788804

789805
def yfull_fname(group):
@@ -894,6 +910,14 @@ def yfull_recurse_list(ul_in, level, fileroot):
894910
#muts['f']=dec['f']
895911
mutse['t']=dec['t']
896912
mutse['isog']=dec['isog']
913+
mutse['ftg']='?'
914+
if 'ftg' in dec:
915+
mutse['ftg']=dec['ftg']
916+
#discard far matches
917+
if not mutse['isog'].startswith(mutse['g'][0]):
918+
mutse['isog']=''
919+
if not mutse['ftg'].startswith(mutse['g'][0]):
920+
mutse['ftg']='?'
897921
mutse['b36']=dec['b36']
898922
mutse['b37']=dec['b37']
899923
mutse['b38']=dec['b38']
@@ -1030,5 +1054,23 @@ def import_ftdna_tree():
10301054
skip=1
10311055
print('FTDNA Tree database size is %d nodes'%len(haplo_ftdna_muts_list))
10321056

1057+
# Annotation indexes filled by load_annotations().
# annotations_by_g: value of an entry's 'g' key (haplogroup name) -> list of entries
# annotations_by_m: value of an entry's 'm' key (mutation name)   -> list of entries
annotations_by_g = {}
annotations_by_m = {}


def load_annotations(fname):
    """Load every annotation file matching a glob pattern into the indexes.

    Each file must be a JSON object whose 'annotation' key holds a list of
    dicts.  An entry with a non-empty 'g' value is appended to
    annotations_by_g[<g>], an entry with a non-empty 'm' value is appended to
    annotations_by_m[<m>]; an entry may land in both.  Repeated calls
    accumulate into the same module-level dicts.

    Parameters:
        fname: glob pattern of the files to load, e.g. 'haploy_annodb_*.txt'.

    Raises:
        KeyError: a matched file has no 'annotation' key.
        ValueError: a matched file is not valid JSON.
    """
    # glob returns matches in arbitrary order; sort so that the order of the
    # annotations attached to one group/mutation is reproducible.
    for fn in sorted(glob.glob(fname)):
        # Annotation text may contain non-ASCII characters; read it as UTF-8
        # instead of relying on the locale's default encoding.
        with open(fn, 'r', encoding='utf-8') as f:
            print('Loading annotation file %s'%fn)
            jdata = json.load(f)
        for anno in jdata['annotation']:
            for key, index in (('g', annotations_by_g), ('m', annotations_by_m)):
                value = anno.get(key)
                if value:
                    index.setdefault(value, []).append(anno)
10331075

10341076

haploy_anno_import.py

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
#!/usr/bin/python3
2+
import re
3+
import csv
4+
import os
5+
from bs4 import BeautifulSoup
6+
import urllib.request
7+
import json
8+
import glob
9+
10+
11+
12+
13+
def yfull_fname(group):
    """Return the local path of the saved YFull tree page for ``group``.

    An empty/false ``group`` selects the root tree page.
    """
    suffix = ('-' + group) if group else ''
    return 'yfull/yfull-ytree' + suffix + '.html'
18+
19+
def yfull_url(group):
20+
if group:
21+
return 'https://www.yfull.com/tree/' + group + '/'
22+
else:
23+
return 'https://www.yfull.com/tree/'
24+
25+
# YFull ytree import (experimental)
26+
def download_yfull_file(group):
    """Prepare the 'yfull' directory and report the page to fetch for ``group``.

    Prints the URL and the local file name the page belongs to.  The actual
    retrieval call is disabled below, so this function does not create the
    file itself.

    Parameters:
        group: YFull group name; empty/false selects the root tree page.

    Raises:
        OSError: the 'yfull' directory does not exist and cannot be created.
    """
    # exist_ok ignores only the "already exists" case instead of every OSError.
    os.makedirs('yfull', exist_ok=True)
    fname = yfull_fname(group)
    url = yfull_url(group)
    print('Downloading ' + url + ' to file: ' + fname)
    # NOTE: retrieval is disabled here; the page has to be placed at ``fname``
    # by other means before it can be imported.
    #urllib.request.urlretrieve(url, fname)
35+
36+
def yfull_parse_muts(li):
37+
s=''
38+
snpforhg=li.find('span', class_='yf-snpforhg', recursive=False)
39+
if snpforhg:
40+
s+=snpforhg.text
41+
plussnps=li.find('span', class_='yf-plus-snps', recursive=False)
42+
if plussnps:
43+
s += ' * ' + plussnps['title']
44+
o=[]
45+
if len(s) > 0:
46+
for m in s.split('*'):
47+
o.append(m.strip())
48+
return o
49+
50+
def yfull_parse_age(li):
51+
s=''
52+
agespan=li.find('span', class_='yf-age', recursive=False)
53+
if agespan:
54+
s+=agespan.text
55+
return s
56+
57+
def yfull_parse_person(li):
58+
sams=[]
59+
ul = li.find('ul', recursive=False)
60+
if ul:
61+
lis = ul.find_all('li', recursive=False)
62+
else:
63+
return sams
64+
if not lis:
65+
return sams
66+
for li in lis:
67+
has_sample=0
68+
sam=''
69+
if li.has_attr('valsampleid'):
70+
sam+=li['valsampleid']+ ': '
71+
has_sample=1
72+
for geo in li.find_all('b', recursive=False):
73+
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'fl' in geo['class']:
74+
if geo.has_attr('title'):
75+
sam+=geo['title']
76+
if geo.has_attr('original-title'):
77+
sam+=geo['original-title']
78+
sam+=' '
79+
if geo.has_attr('class') and 'yf-geo' in geo['class'] and 'yf-lang' in geo['class']:
80+
if geo.has_attr('title'):
81+
sam+=geo['title']
82+
if geo.has_attr('original-title'):
83+
sam+=geo['original-title']
84+
for geo in li.find_all('span', recursive=False):
85+
if geo.has_attr('class') and 'yf-a-age' in geo['class']:
86+
if geo.has_attr('title'):
87+
sam+=geo['title']
88+
if geo.has_attr('original-title'):
89+
sam+=geo['original-title']
90+
if has_sample:
91+
sams.append(sam)
92+
#sam+=' '
93+
#print(sam)
94+
return sams
95+
96+
def yfull_is_tree_quirk(group_name, fileroot):
    """Tell whether ``group_name`` is one of the special-cased groups.

    Returns True only for 'R-P312' and 'R-Z2118', and only when ``fileroot``
    is false; in every other case returns False.
    """
    if fileroot:
        return False
    return group_name in ('R-P312', 'R-Z2118')
104+
105+
def yfull_recurse_list(ul_in, level, fileroot):
    """Walk one <ul> of a YFull tree page and collect sample annotations.

    For every direct <li> of ``ul_in`` that has a direct <a> (the group
    name), each sample string returned by yfull_parse_person() is printed and
    appended to the module-level ``annos`` list as
    ``{"g": <group name without '*'>, "txt": 'YFULL: <sample>'}``.

    The walk then continues:
      * into the item's direct nested <ul>, one level deeper, unless
        yfull_is_tree_quirk() flags the group;
      * otherwise, unless the group name ends with '*', into the separate
        page named by the item's link (second-to-last path component of the
        href) via yfull_recurse_file().

    Parameters:
        ul_in: the <ul> element to walk.
        level: depth of ``ul_in``; passed on to the recursive calls.
        fileroot: True when ``ul_in`` is the top list of a page.

    Returns:
        Always 0.
    """
    for li in ul_in.find_all('li', recursive=False):
        group_name = ''
        anchor = li.find('a', recursive=False)
        # Everything that needs the group name stays behind this guard so an
        # item without an anchor is skipped instead of failing on None.
        if anchor:
            group_name = anchor.text
            grp = group_name.strip('*')
            for txt in yfull_parse_person(li):
                print(grp, txt)
                annos.append({
                    "g": grp,
                    "txt": 'YFULL: %s'%(txt)
                })

        ul = li.find('ul', recursive=False)
        if ul and not yfull_is_tree_quirk(group_name, fileroot):
            yfull_recurse_list(ul, level+1, False)
            continue

        # No inline subtree was followed: '*' groups end here, the others
        # continue in the page their link points to.
        if group_name.endswith('*'):
            continue
        linked = li.find('a', href=True, recursive=False)
        if linked:
            group = linked['href'].split('/')[-2]
            yfull_recurse_file(group, level)
    return 0
144+
145+
def yfull_recurse_file(group, level):
    """Import the saved YFull tree page of ``group``.

    If the page file is missing, download_yfull_file() is asked for it first.
    The page is then parsed and its 'ul#tree' list is walked with
    yfull_recurse_list() as a file root.

    Parameters:
        group: YFull group name; empty/false selects the root tree page.
        level: tree depth at which this page's top list sits.

    Raises:
        OSError: the page file still cannot be opened.
    """
    fname = yfull_fname(group)
    if not os.path.isfile(fname):
        print('File not found: ' +fname)
        download_yfull_file(group)

    # Read the raw bytes and close the file before recursing: nested pages
    # would otherwise keep one handle open per level.  Passing bytes lets the
    # parser decode the page itself instead of using the locale's encoding.
    with open(fname, 'rb') as f:
        print('Importing file: ' +fname)
        markup = f.read()

    soup = BeautifulSoup(markup, features="html.parser")
    ul = soup.find('ul', id='tree')
    if ul is None:
        print('No tree found in file: ' + fname)
        return
    yfull_recurse_list(ul, level, True)
    #yfull_get_info(soup)
160+
161+
def import_yfull_tree(gr):
    """Import the saved YFull page of group ``gr``, starting at level 0."""
    top_level = 0
    yfull_recurse_file(gr, top_level)
163+
164+
165+
166+
167+
168+
#
169+
def import_ftdna_chart(fname, info=''):
    """Collect annotations from a saved FTDNA project result chart.

    The table inside the first <div> whose id matches 'MainContent.*' is
    read.  Columns are located through the header cells of its first row
    ('Kit', 'Paternal', 'Haplogroup' as substrings).  For every data row
    with a non-empty haplogroup cell an entry
    ``{"g": <haplogroup>, "txt": 'FTDNA: <kit> <paternal> <info>'}`` is
    printed and appended to the module-level ``annos`` list.

    Parameters:
        fname: path of the saved HTML page.
        info: free text appended to every annotation, e.g. '[Estonia]'.

    A page without such a table, or without a Haplogroup column, is
    reported and skipped.
    """
    # Bytes in, so the parser decodes the page itself rather than relying on
    # the locale's default encoding; the file is closed before parsing.
    with open(fname, 'rb') as f:
        print('Importing file: ' +fname)
        markup = f.read()
    soup = BeautifulSoup(markup, features="html.parser")

    #rows = soup.find('div', id='MainContent_color1_GridView1').find('table').find_all("tr")
    #rows = soup.find('table').find_all("tr")
    container = soup.find('div', {"id" : re.compile('MainContent.*')})
    table = container.find('table') if container is not None else None
    rows = table.find_all("tr") if table is not None else []
    if not rows:
        print('No result table found in file: ' + fname)
        return

    # Column positions; -1 means "no such column".  When several headers
    # match, the last one wins.
    kiti = -1
    pati = -1
    gri = -1
    for i, th in enumerate(rows[0].find_all("th")):
        header = th.get_text()
        if 'Kit' in header:
            kiti = i
        if 'Paternal' in header:
            pati = i
        if 'Haplogroup' in header:
            gri = i
    if gri < 0:
        # Without this check index -1 would silently read the last column.
        print('No Haplogroup column found in file: ' + fname)
        return

    def cell(tds, idx):
        # Text of column idx, or '' when the column or the cell is missing.
        if 0 <= idx < len(tds):
            return tds[idx].get_text().strip()
        return ''

    for row in rows:
        tds = row.find_all("td")
        if len(tds) <= 1:
            continue
        gr = cell(tds, gri)
        if not gr:
            continue
        kit = cell(tds, kiti)
        pat = cell(tds, pati)
        #if 'MIN' in kit or 'MAX' in kit or 'MODE' in kit:
        #    continue
        print(kit, pat, gr)
        annos.append({
            "g": gr,
            "txt": 'FTDNA: %s %s %s'%(kit, pat, info)
        })
216+
217+
def save_anno(fname):
    """Write the annotations collected in the module-level ``annos`` list.

    The output is a JSON object with an 'info' string naming this script and
    an 'annotation' list holding the collected entries.

    Parameters:
        fname: path of the file to create or overwrite.
    """
    jroot = {
        'info': 'haploy_anno_import.py',
        'annotation': annos,
    }
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(jroot, f, indent=1)
223+
224+
225+
# Example annotations - it probably doesn't make sense for everyone to import every project
226+
227+
# YFull samples: every listed group page is imported into one annotation file.
annos = []
for yfull_group in (
    'A00',
    'A0-T',
    #'N-FGC28435',
    #'N',
):
    import_yfull_tree(yfull_group)
save_anno('haploy_annodb_yfull.txt')

# FTDNA project charts: (saved page, tag appended to each annotation).
annos = []
for chart_file, chart_tag in (
    ('ftdna/FamilyTreeDNA - Estonia.htm', '[Estonia]'),
    ('ftdna/FamilyTreeDNA - Saami Project.htm', '[Saami]'),
    ('ftdna/FamilyTreeDNA - I1 Suomi Finland & N-CTS8565 -projekti.htm', '[I1 Suomi]'),
    ('ftdna/FamilyTreeDNA - Finland DNA Project.htm', '[FinlandDNA]'),
    ('ftdna/FamilyTreeDNA - RussiaDNA Project.htm', '[RussiaDNA]'),
    ('ftdna/FamilyTreeDNA - R1a1a and Subclades Y-DNA Project.htm', '[R1a1a]'),
):
    import_ftdna_chart(chart_file, chart_tag)
save_anno('haploy_annodb_ftdnatest.txt')

haploy_annodb_example.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"info": "Test file",
3+
"annotation": [
4+
{"m": "M2019", "txt": "Yakuts"},
5+
{"m": "Y13850", "txt": "Ugric"},
6+
{"m": "Y10932", "txt": "Rurikids"},
7+
{"m": "L550", "txt": "Scandic/Baltic"},
8+
{"m": "Z1933", "txt": "Savo/Karjala"},
9+
{"m": "CTS9976", "txt": "Finns"},
10+
{"m": "VL62", "txt": "Karjala"},
11+
{"m": "CTS8565", "txt": "Savo"},
12+
{"m": "M7414", "txt": "Kärsä-Laitinen"},
13+
{"m": "VL29", "txt": "North European"},
14+
{"m": "PH521", "txt": "Lapland"},
15+
{"m": "L1022", "txt": "Häme/Länsi-Suomi"}
16+
]
17+
}

haploy_find.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
print("Loading DB2...")
4242
haploy.load_db2j(min_tree_load_level=min_tree_load_level)
4343
print("DB loaded!")
44+
haploy.load_annotations('haploy_annodb_*.txt')
4445
rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force, min_match_level=min_match_level)
4546
print(rep)
4647
else:

0 commit comments

Comments
 (0)