|
| 1 | +#!/usr/bin/python3 |
| 2 | +import re |
| 3 | +import csv |
| 4 | +import os |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +import urllib.request |
| 7 | +import json |
| 8 | +import glob |
| 9 | + |
| 10 | + |
| 11 | + |
| 12 | + |
def yfull_fname(group):
    """Return the local cache path of the YFull tree page for *group*.

    A falsy *group* (empty string or None) selects the root tree page.
    """
    if not group:
        return 'yfull/yfull-ytree.html'
    return 'yfull/yfull-ytree-' + group + '.html'
| 18 | + |
def yfull_url(group):
    """Return the yfull.com tree URL for *group*.

    A falsy *group* (empty string or None) yields the URL of the root tree.
    """
    root_url = 'https://www.yfull.com/tree/'
    return root_url + group + '/' if group else root_url
| 24 | + |
| 25 | +# YFull mtree import (experimental) |
def download_yfull_file(group):
    """Prepare the local cache location for the YFull page of *group*.

    Creates the ``yfull`` directory when it is missing and prints which URL
    belongs in which local file.  The HTTP fetch itself is commented out in
    this version, so the page still has to be saved to that path by other
    means before it can be imported.

    Args:
        group: haplogroup name; a falsy value selects the root tree.
    """
    try:
        os.mkdir('yfull')
    except OSError:
        # Best effort: typically the directory already exists.
        pass
    fname = yfull_fname(group)
    url = yfull_url(group)
    # The separating space keeps the URL from running into "to file:".
    print('Downloading ' + url + ' to file: ' + fname)
    # Fetch disabled:
    # urllib.request.urlretrieve(url, fname)
| 35 | + |
def yfull_parse_muts(li):
    """Return the mutation names attached to a tree node.

    Reads the text of the direct child ``span.yf-snpforhg`` and the
    ``title`` attribute of the direct child ``span.yf-plus-snps``; both are
    split on ``*``.

    Args:
        li: element exposing BeautifulSoup's ``find`` API.

    Returns:
        The stripped, non-empty names in document order; ``[]`` when neither
        span is present.
    """
    chunks = []
    snpforhg = li.find('span', class_='yf-snpforhg', recursive=False)
    if snpforhg:
        chunks.append(snpforhg.text)
    plussnps = li.find('span', class_='yf-plus-snps', recursive=False)
    if plussnps:
        # .get() tolerates a span that carries no title attribute.
        chunks.append(plussnps.get('title', ''))
    # Joining with '*' keeps the two sources separated; dropping empty
    # pieces avoids a bogus '' entry when only one of the spans has content.
    pieces = (piece.strip() for piece in ' * '.join(chunks).split('*'))
    return [piece for piece in pieces if piece]
| 49 | + |
def yfull_parse_age(li):
    """Return the text of the node's direct ``span.yf-age`` child, or ''."""
    age_span = li.find('span', class_='yf-age', recursive=False)
    if not age_span:
        return ''
    return '' + age_span.text
| 56 | + |
def _yfull_tooltip(tag):
    """Concatenate the tag's ``title`` and ``original-title`` attributes."""
    return ''.join(
        tag[attr] for attr in ('title', 'original-title') if tag.has_attr(attr)
    )


def yfull_parse_person(li):
    """Describe the samples listed directly under a tree node.

    Looks at the ``<li>`` children of the node's direct ``<ul>`` child.  Every
    child carrying a ``valsampleid`` attribute becomes one string of the form
    ``"<id>: <geo> <lang><age>"``, where the parts come from the tooltips of
    its direct ``b.yf-geo.fl``, ``b.yf-geo.yf-lang`` and ``span.yf-a-age``
    children.

    Args:
        li: element exposing BeautifulSoup's ``find``/``find_all`` API.

    Returns:
        One description per sample in document order; ``[]`` when the node
        has no direct ``<ul>`` or no sample children.
    """
    ul = li.find('ul', recursive=False)
    if not ul:
        return []

    samples = []
    for item in ul.find_all('li', recursive=False):
        # Only items with a sample id are reported, so skip the rest early.
        if not item.has_attr('valsampleid'):
            continue
        parts = [item['valsampleid'], ': ']
        for bold in item.find_all('b', recursive=False):
            classes = bold['class'] if bold.has_attr('class') else ()
            if 'yf-geo' not in classes:
                continue
            if 'fl' in classes:
                parts.append(_yfull_tooltip(bold))
                parts.append(' ')
            if 'yf-lang' in classes:
                parts.append(_yfull_tooltip(bold))
        for span in item.find_all('span', recursive=False):
            if span.has_attr('class') and 'yf-a-age' in span['class']:
                parts.append(_yfull_tooltip(span))
        samples.append(''.join(parts))
    return samples
| 95 | + |
def yfull_is_tree_quirk(group_name, fileroot):
    """Tell whether *group_name* is one of the hard-coded quirk groups.

    Always False when *fileroot* is truthy; otherwise True only for the
    group names 'R-P312' and 'R-Z2118'.
    """
    quirk_groups = ('R-P312', 'R-Z2118')
    return not fileroot and group_name in quirk_groups
| 104 | + |
def yfull_recurse_list(ul_in, level, fileroot):
    """Walk one ``<ul>`` level of a YFull tree page and collect annotations.

    For every direct ``<li>`` child that has a direct ``<a>``: each sample
    returned by ``yfull_parse_person`` is printed and appended to the
    module-level ``annos`` list under the anchor text with leading/trailing
    ``*`` removed.  A nested direct ``<ul>`` is walked recursively unless
    ``yfull_is_tree_quirk`` flags the node; otherwise the node's link is
    followed into that group's own page via ``yfull_recurse_file``.

    Args:
        ul_in: list element whose direct ``<li>`` children are processed.
        level: depth value handed on (``level + 1`` for nested lists).
        fileroot: truthy when *ul_in* is the top-level list of a page file.

    Returns:
        Always 0.
    """
    for li in ul_in.find_all('li', recursive=False):
        anchor = li.find('a', recursive=False)
        group_name = anchor.text if anchor else ''
        if anchor:
            grp = group_name.strip('*')
            for txt in yfull_parse_person(li):
                print(grp, txt)
                annos.append({
                    "g": grp,
                    "txt": 'YFULL: %s' % (txt),
                })

        ul = li.find('ul', recursive=False)
        if ul and not yfull_is_tree_quirk(group_name, fileroot):
            yfull_recurse_list(ul, level + 1, False)
            continue

        # Nodes whose name ends in '*' are never followed into a file.
        if group_name.endswith('*'):
            continue
        link = li.find('a', href=True, recursive=False)
        if not link:
            continue
        # The group is the second-to-last path component ('/tree/<group>/').
        parts = link['href'].split('/')
        if len(parts) < 2:
            continue  # href without any '/' carries no group component
        yfull_recurse_file(parts[-2], level)
    return 0
| 144 | + |
def yfull_recurse_file(group, level):
    """Import the locally saved YFull tree page of *group*.

    When the cached page is missing, ``download_yfull_file`` is asked to
    provide it first.  The page is parsed, and its ``<ul id="tree">`` list is
    walked with ``yfull_recurse_list``.

    Args:
        group: haplogroup name; a falsy value selects the root tree.
        level: depth value passed on to ``yfull_recurse_list``.

    Raises:
        OSError: the page still cannot be opened after the download step.
    """
    fname = yfull_fname(group)
    if not os.path.isfile(fname):
        print('File not found: ' + fname)
        download_yfull_file(group)

    # Explicit encoding: do not depend on the platform's locale default.
    with open(fname, encoding='utf-8') as f:
        print('Importing file: ' + fname)
        soup = BeautifulSoup(f.read(), features="html.parser")

    # The file is closed before recursing, so nested imports do not keep a
    # growing chain of handles open.
    ul = soup.find('ul', id='tree')
    if ul is None:
        print('No tree list found in file: ' + fname)
        return
    yfull_recurse_list(ul, level, True)
| 160 | + |
def import_yfull_tree(gr):
    """Import the YFull tree page of group *gr*, starting at level 0."""
    root_level = 0
    yfull_recurse_file(gr, root_level)
| 163 | + |
| 164 | + |
| 165 | + |
| 166 | + |
| 167 | + |
| 168 | +# |
def _ftdna_cell(tds, idx):
    """Return the stripped text of cell *idx*, or '' when it is absent."""
    if 0 <= idx < len(tds):
        return tds[idx].get_text().strip()
    return ''


def import_ftdna_chart(fname, info=''):
    """Collect annotations from a saved FamilyTreeDNA project chart page.

    The first table inside the first ``<div>`` whose id matches
    ``MainContent.*`` is read.  Columns are located through the ``<th>``
    captions of the first row ('Kit', 'Paternal', 'Haplogroup').  Every row
    with more than one ``<td>`` and a non-empty haplogroup is printed and
    appended to the module-level ``annos`` list.

    Args:
        fname: path of the saved HTML page.
        info: free text appended to every annotation.
    """
    with open(fname, encoding='utf-8') as f:
        print('Importing file: ' + fname)
        soup = BeautifulSoup(f.read(), features="html.parser")

    container = soup.find('div', {"id": re.compile('MainContent.*')})
    table = container.find('table') if container else None
    rows = table.find_all("tr") if table else []
    if not rows:
        print('No chart table found in file: ' + fname)
        return

    # Locate columns by header caption; -1 marks a column that is absent.
    columns = {'Kit': -1, 'Paternal': -1, 'Haplogroup': -1}
    for i, th in enumerate(rows[0].find_all("th")):
        caption = th.get_text()
        for key in columns:
            if key in caption:
                columns[key] = i

    gri = columns['Haplogroup']
    if gri < 0:
        # Without this column index -1 would silently read the last cell.
        print('No Haplogroup column found in file: ' + fname)
        return

    for row in rows:
        tds = row.find_all("td")
        if len(tds) <= 1:
            continue  # header row or single-cell row
        gr = _ftdna_cell(tds, gri)
        if not gr:
            continue
        kit = _ftdna_cell(tds, columns['Kit'])
        pat = _ftdna_cell(tds, columns['Paternal'])
        print(kit, pat, gr)
        annos.append({
            "g": gr,
            "txt": 'FTDNA: %s %s %s' % (kit, pat, info),
        })
| 216 | + |
def save_anno(fname):
    """Write the module-level ``annos`` list to *fname* as indented JSON."""
    document = {}
    document['info'] = 'haploy_anno_iport.py'
    document['annotation'] = annos
    with open(fname, 'w') as out:
        json.dump(document, out, indent=1)
| 223 | + |
| 224 | + |
# Example annotations - it probably doesn't make sense for everyone to import
# every project.

# YFull trees: everything gathered is written to one annotation file.
annos = []
for _group in (
    'A00',
    'A0-T',
    # 'N-FGC28435',
    # 'N',
):
    import_yfull_tree(_group)
save_anno('haploy_annodb_yfull.txt')

# FTDNA project charts: start from an empty list, then write a second file.
annos = []
for _chart, _tag in (
    ('ftdna/FamilyTreeDNA - Estonia.htm', '[Estonia]'),
    ('ftdna/FamilyTreeDNA - Saami Project.htm', '[Saami]'),
    ('ftdna/FamilyTreeDNA - I1 Suomi Finland & N-CTS8565 -projekti.htm', '[I1 Suomi]'),
    ('ftdna/FamilyTreeDNA - Finland DNA Project.htm', '[FinlandDNA]'),
    ('ftdna/FamilyTreeDNA - RussiaDNA Project.htm', '[RussiaDNA]'),
    ('ftdna/FamilyTreeDNA - R1a1a and Subclades Y-DNA Project.htm', '[R1a1a]'),
):
    import_ftdna_chart(_chart, _tag)
save_anno('haploy_annodb_ftdnatest.txt')
0 commit comments