-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathann2expann.pl
More file actions
executable file
·148 lines (133 loc) · 11.7 KB
/
Copy pathann2expann.pl
File metadata and controls
executable file
·148 lines (133 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/perl
#
# Expand Illumina Methylation 450K Annotation (hg19) file
#
# Description: Expands the Illumina Methylation 450K Annotation (hg19) file:
# - each line is split on the GeneName, GeneID, and GeneGroup columns by ";"
# - then each of these lines is printed again
#
# Written by: Michal Mokry & Sander W. van der Laan; UMC Utrecht, Utrecht, the
# Netherlands, m.mokry@umcutrecht.nl or s.w.vanderlaan-2@umcutrecht.nl.
# Version: 1.0
# Update date: 2018-02-09
#
# Usage: ann2expann.pl [INPUT] [GZIP/NORM] [OUTPUT]
#
#
# HEAD of Illumina Methylation 450K Annotation (hg19) file *BEFORE* expansion
#
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# CpG chr pos strand Name AddressA AddressB ProbeSeqA ProbeSeqB Type NextBase Color Forward_Sequence SourceSeq Random_Loci Methyl27_Loci UCSC_RefGene_Name UCSC_RefGene_Accession UCSC_RefGene_Group Phantom DMR Enhancer HMM_Island Regulatory_Feature_Name Regulatory_Feature_Group DHS Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs SBE_maf Probe_SNPs Probe_SNPs_10 Islands_Name Relation_to_Island
# cg00000029 chr16 53468112 + cg00000029 14782418 AACTATACTAACRAAAAAATATCCAAAAAACACTAACRTATAAAAATTTC II TTTTTTAGATAAGGATATCCAGGCGATGAGGAAGTTTTACTTCTGGGAACAGCCTGGATA[CG]AAACCTTCACACGTCAGTGTCTTTTGGACATTTTCTCGTCAGTACAGCCCTGTTGAATGT GCTGTACTGACGAGAAAATGTCCAAAAGACACTGACGTGTGAAGGTTTCG RBL2 NM_005611 TSS1500 16:53467838-53469685 Promoter_Associated TRUE NA NA NA NA NA NA chr16:53468284-53469209 N_Shore
# cg00000108 chr3 37459206 + cg00000108 12709357 ATACAATAAAACAAACCTAAAATAATCCTAACTCCRCTATCATCCTAACC II TCCATTTTGAAGGAAAAAAATGAAGGCTCTGAAAGTGTAAATCGCTTACTGAAGGGCACA[CG]GCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCATATGTCATGGCT CGGCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCA C3orf35;C3orf35 NM_178339;NM_178342 Body;3'UTR rs9857774 0.03135 NA NA NA NA rs9857774 OpenSea
# cg00000109 chr3 171916037 + cg00000109 59755374 CAATACTAACAAACACATATACCCCCCCACAAATCTTAACTTCTAAATAC II GCCTTAGTCCTGAATGAGCCATTTCTCTAAGAAGTCCTGGCTTCTTTTTTAATAGAGAAT[CG]TATTTAGAAGCCAAGATCTGTGGGGGGGTACATGTGCCTGTTAGTATTGCAGTTGTGCCT AATACTAACAGGCACATGTACCCCCCCACAGATCTTGGCTTCTAAATACG FNDC3B;FNDC3B NM_001135095;NM_022763 Body;Body low-CpG:173398671-173398760 NA NA NA NA NA NA rs9864492 OpenSea
# cg00000165 chr1 91194674 - cg00000165 12637463 CAAAATCTATTAATACAATAACTTTTAATAAAACAACTAAAACACACATC II CTAAGTGCAGTCAGGATCTGTTAGTACAGTGGCTTTTGATGGAACAGCTGAGGCACACAT[CG]CCCGTGGCATGGACTCCGGGGCCGAACGCTCACGACCAAGACTTTTGCCCTTTTGAAATG AGGATCTGTTAGTACAGTGGCTTTTGATGGAACAGCTGAGGCACACATCG CDMR TRUE 1:90967262-90967361 NA NA NA NA NA NA chr1:91190489-91192804 S_Shore
# cg00000236 chr8 42263294 - cg00000236 12649348 TATAACRTCATATTAAAAAAAACRATCTAACCCACCAATTTATACATCAC II CTCAGCGACAGTGTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCA[CG]TCCTGCATGTCCCACACCATTTTTTCATGACCTTGTAATATACTGGTCTCTGTGCTATAG GTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCACG VDAC3;VDAC3 NM_005662;NM_001135694 3'UTR;3'UTR NA NA NA NA NA NA OpenSea
# cg00000289 chr14 69341139 + cg00000289 18766346 ATCTACTATATTCATTTCTCCAATCTCATATCCATTTTAATATAAAAATC II CAAGTGAGCTAGCAAACACACATGCACCAATGTGCCTTTTGACAAGAGTACCCCCTACCC[CG]ACTCCCACACCAAAATGGACATGAGATTGGAGAAATGAATACAGCAGATGGAACAGATAG TCTGCTGTATTCATTTCTCCAATCTCATGTCCATTTTGGTGTGGGAGTCG ACTN1;ACTN1;ACTN1 NM_001130005;NM_001130004;NM_001102 3'UTR;3'UTR;3'UTR NA NA NA NA NA NA chr14:69341427-69341820 N_Shore
# cg00000292 chr16 28890100 + cg00000292 43764508 AAAACATTAATTACCAACCRCTCTTCCAAAAAACACTTACCATTAAAACC II TGGGGTGAGTGAGACCACGGGCCTCACCCCGGACCAAGTTAAGCGGAATCTGGAGAAATA[CG]GCCTCAATGGTAAGTGTCCCTTGGAAGAGCGGCTGGTAATTAATGCCCTCCTGCACCCCC CGGCCTCAATGGTAAGTGTCCCTTGGAAGAGCGGCTGGTAATTAATGCCC TRUE ATP2A1;ATP2A1 NM_004320;NM_173201 1stExon;1stExon rs62037371 0.342008 NA NA NA NA rs62037371 chr16:28890954-28891868 N_Shore
# cg00000321 chr8 41167802 - cg00000321 62789509 ATAAATACCCAATAAACCTAACTAAACTCCCTAAAAAACRAAACRAAAAC II GAGGTCTGCTTGTAAATACCCAGTGGGCCTGGCTGGGCTCCCTGGAAGGCGAGGCGAAGG[CG]CAGTTGGAGCTGTTTGCTGTGAGCAGCACCTCTCCAGGTGGGGCCGCCCATGGTGGCCCT CGCCTTCGCCTCGCCTTCCAGGGAGCCCAGCCAGGCCCACTGGGTATTTA SFRP1 NM_003012 TSS1500 NA NA NA NA NA NA chr8:41165852-41167140 S_Shore
# cg00000363 chr1 230560793 + cg00000363 16661505 RTCTTAACTTAACTTAATTTTCTCCTTAATCTAAAAAACTTTCCCTATCC II CTGCCCAATCGGTCCCTTCCTTCACTCCTCCCCATTCTTAACAAGAGATCTGGAATGGCG[CG]GACAGGGAAAGTTTCTCAGATTAAGGAGAAAACTAAGCCAAGTCAAGACGCCGCGGGTGG TCTTGACTTGGCTTAGTTTTCTCCTTAATCTGAGAAACTTTCCCTGTCCG 1:228627033-228629325 NA NA NA NA NA NA chr1:230561103-230562702 N_Shore
#
#
# HEAD of Illumina Methylation 450K Annotation (hg19) file *AFTER* expansion
#
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# CpG chr pos strand Name AddressA AddressB ProbeSeqA ProbeSeqB Type NextBase Color Forward_Sequence SourceSeq Random_Loci Methyl27_Loci UCSC_RefGene_Name UCSC_RefGene_Accession UCSC_RefGene_Group Phantom DMR Enhancer HMM_Island Regulatory_Feature_Name Regulatory_Feature_Group DHS Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs SBE_maf Probe_SNPs Probe_SNPs_10 Islands_Name Relation_to_Island
# cg00000029 chr16 53468112 + cg00000029 14782418 AACTATACTAACRAAAAAATATCCAAAAAACACTAACRTATAAAAATTTC II TTTTTTAGATAAGGATATCCAGGCGATGAGGAAGTTTTACTTCTGGGAACAGCCTGGATA[CG]AAACCTTCACACGTCAGTGTCTTTTGGACATTTTCTCGTCAGTACAGCCCTGTTGAATGT GCTGTACTGACGAGAAAATGTCCAAAAGACACTGACGTGTGAAGGTTTCG RBL2 NM_005611 TSS1500 16:53467838-53469685 Promoter_Associated TRUE NA NA NA NA NA NA chr16:53468284-53469209 N_Shore
# cg00000108 chr3 37459206 + cg00000108 12709357 ATACAATAAAACAAACCTAAAATAATCCTAACTCCRCTATCATCCTAACC II TCCATTTTGAAGGAAAAAAATGAAGGCTCTGAAAGTGTAAATCGCTTACTGAAGGGCACA[CG]GCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCATATGTCATGGCT CGGCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCA C3orf35 NM_178339 Body rs9857774 0.03135 NA NA NA NA rs9857774 OpenSea
# cg00000108 chr3 37459206 + cg00000108 12709357 ATACAATAAAACAAACCTAAAATAATCCTAACTCCRCTATCATCCTAACC II TCCATTTTGAAGGAAAAAAATGAAGGCTCTGAAAGTGTAAATCGCTTACTGAAGGGCACA[CG]GCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCATATGTCATGGCT CGGCCAGGATGACAGCGGAGCCAGGATCACCCCAGGTCTGTCTCATTGCA C3orf35 NM_178342 3'UTR rs9857774 0.03135 NA NA NA NA rs9857774 OpenSea
# cg00000109 chr3 171916037 + cg00000109 59755374 CAATACTAACAAACACATATACCCCCCCACAAATCTTAACTTCTAAATAC II GCCTTAGTCCTGAATGAGCCATTTCTCTAAGAAGTCCTGGCTTCTTTTTTAATAGAGAAT[CG]TATTTAGAAGCCAAGATCTGTGGGGGGGTACATGTGCCTGTTAGTATTGCAGTTGTGCCT AATACTAACAGGCACATGTACCCCCCCACAGATCTTGGCTTCTAAATACG FNDC3B NM_001135095 Body low-CpG:173398671-173398760 NA NA NA NA NA NA rs9864492 OpenSea
# cg00000109 chr3 171916037 + cg00000109 59755374 CAATACTAACAAACACATATACCCCCCCACAAATCTTAACTTCTAAATAC II GCCTTAGTCCTGAATGAGCCATTTCTCTAAGAAGTCCTGGCTTCTTTTTTAATAGAGAAT[CG]TATTTAGAAGCCAAGATCTGTGGGGGGGTACATGTGCCTGTTAGTATTGCAGTTGTGCCT AATACTAACAGGCACATGTACCCCCCCACAGATCTTGGCTTCTAAATACG FNDC3B NM_022763 Body low-CpG:173398671-173398760 NA NA NA NA NA NA rs9864492 OpenSea
# cg00000165 chr1 91194674 - cg00000165 12637463 CAAAATCTATTAATACAATAACTTTTAATAAAACAACTAAAACACACATC II CTAAGTGCAGTCAGGATCTGTTAGTACAGTGGCTTTTGATGGAACAGCTGAGGCACACAT[CG]CCCGTGGCATGGACTCCGGGGCCGAACGCTCACGACCAAGACTTTTGCCCTTTTGAAATG AGGATCTGTTAGTACAGTGGCTTTTGATGGAACAGCTGAGGCACACATCG CDMR TRUE 1:90967262-90967361 NA NA NA NA NA NA chr1:91190489-91192804 S_Shore
# cg00000236 chr8 42263294 - cg00000236 12649348 TATAACRTCATATTAAAAAAAACRATCTAACCCACCAATTTATACATCAC II CTCAGCGACAGTGTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCA[CG]TCCTGCATGTCCCACACCATTTTTTCATGACCTTGTAATATACTGGTCTCTGTGCTATAG GTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCACG VDAC3 NM_005662 3'UTR NA NA NA NA NA NA OpenSea
# cg00000236 chr8 42263294 - cg00000236 12649348 TATAACRTCATATTAAAAAAAACRATCTAACCCACCAATTTATACATCAC II CTCAGCGACAGTGTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCA[CG]TCCTGCATGTCCCACACCATTTTTTCATGACCTTGTAATATACTGGTCTCTGTGCTATAG GTAGCGTCATGTTAGAGGAGACGATCTGACCCACCAGTTTGTACATCACG VDAC3 NM_001135694 3'UTR NA NA NA NA NA NA OpenSea
# cg00000289 chr14 69341139 + cg00000289 18766346 ATCTACTATATTCATTTCTCCAATCTCATATCCATTTTAATATAAAAATC II CAAGTGAGCTAGCAAACACACATGCACCAATGTGCCTTTTGACAAGAGTACCCCCTACCC[CG]ACTCCCACACCAAAATGGACATGAGATTGGAGAAATGAATACAGCAGATGGAACAGATAG TCTGCTGTATTCATTTCTCCAATCTCATGTCCATTTTGGTGTGGGAGTCG ACTN1 NM_001130005 3'UTR NA NA NA NA NA NA chr14:69341427-69341820 N_Shore
# Starting expanding
use strict;
use warnings;

# Run the expander only when this file is executed as a script, so that the
# helper below can be loaded (e.g. with require) without any side effects.
main(@ARGV) unless caller;

# Print the banner, expand the annotation file and report start/end times.
#
# Three arguments are required:
# - the input file (IN)
# - whether the input file is zipped (GZIP/NORM)
# - the output file (OUT)
#
# Dies on missing/invalid arguments and on any failure to open, read or
# write the files.
sub main {
    my ($file, $zipped, $output) = @_;

    print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n";
    print "+ 450K ANNOTATION EXPANDER +\n";
    print "+ Version 1.0 +\n";
    print "+ 09-02-2018 +\n";
    print "+ Written by: Michal Mokry & Sander W. van der Laan +\n";
    print "+ +\n";
    print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n";
    print "\n";
    print "Hello. I am starting the expand the Illumina Methylation 450K Annotation (hg19) file.\n";
    my $time = localtime; # scalar context
    print "The current date and time is: $time.\n";
    print "\n";

    # Validate the arguments before touching any file, so a bad invocation
    # can never truncate an existing output file.
    if (!defined $file || !defined $zipped || !defined $output) {
        die "* ERROR: Three arguments are required.\n"
          . "  Usage: ann2expann.pl [INPUT] [GZIP/NORM] [OUTPUT]\n";
    }

    my $in;
    if ($zipped eq "GZIP") {
        # List-form pipe open: the filename is passed to gunzip as a single
        # argument and is never interpreted by a shell.
        open($in, '-|', 'gunzip', '-c', '--', $file)
            or die "* ERROR: Couldn't open input file: $!";
    }
    elsif ($zipped eq "NORM") {
        open($in, '<', $file)
            or die "* ERROR: Couldn't open input file: $!";
    }
    else {
        die "* ERROR: Please, indicate the type of input file: gzipped [GZIP] or uncompressed [NORM]!\n"
          . " (Arguments are case-sensitive.)\n";
    }

    open(my $out, '>', $output)
        or die "* ERROR: Couldn't open output file: $!";

    # The first line is the column header; copy it through unchanged.
    # An empty input simply yields an empty output.
    my $header = <$in>;
    print {$out} $header if defined $header;

    while (my $line = <$in>) {
        print {$out} expand_annotation_line($line);
    }

    # Closing a pipe also reaps gunzip; a false return with $! unset means
    # the child exited non-zero (e.g. corrupt archive), reported through $?.
    close($in) # stop reading the input-file
        or die $!
            ? "* ERROR: Couldn't close input file: $!"
            : "* ERROR: Reading the input file failed (exit status " . ($? >> 8) . ")";
    # Buffered write errors only surface when the handle is closed.
    close($out) # stop writing the output-file
        or die "* ERROR: Couldn't close output file: $!";

    print "Wow. That was a lot of work. I'm glad it's done. Let's have beer, buddy!\n";
    my $newtime = localtime; # scalar context
    print "The current date and time is: $newtime.\n";
    print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n";

    return;
}

# Expand one tab-separated annotation line into one line per gene entry.
#
# Columns 17-19 (indices 16-18: UCSC_RefGene_Name, UCSC_RefGene_Accession,
# UCSC_RefGene_Group) are split on ";". For every gene name a copy of the
# line is produced carrying the name, accession and group found at the same
# position in their lists; a missing accession or group becomes an empty
# field.
#
# Parameter: the raw input line, including its line ending (if any).
# Returns:   the list of output lines, each with the original line ending.
#            Lines with fewer than 19 columns, or with an empty gene-name
#            column, are returned unchanged as a single element.
sub expand_annotation_line {
    my ($raw) = @_;

    # Detach the line ending so it cannot stick to the last split value
    # when the group column happens to be the final column.
    my $body = $raw;
    my $eol  = ($body =~ s/(\r?\n)\z//) ? $1 : '';

    # LIMIT -1 keeps trailing empty columns, so the column count is preserved.
    my @fields = split /\t/, $body, -1;
    return ($raw) if @fields < 19;

    my @genes  = split /;/, $fields[16]; # UCSC_RefGene_Name
    my @ids    = split /;/, $fields[17]; # UCSC_RefGene_Accession
    my @groups = split /;/, $fields[18]; # UCSC_RefGene_Group
    return ($raw) if !@genes;

    my @expanded;
    for my $i (0 .. $#genes) {
        $fields[16] = $genes[$i];
        # split drops trailing empty entries, so the lists may be shorter
        # than @genes; fall back to an empty field instead of undef.
        $fields[17] = defined $ids[$i]    ? $ids[$i]    : '';
        $fields[18] = defined $groups[$i] ? $groups[$i] : '';
        push @expanded, join("\t", @fields) . $eol;
    }
    return @expanded;
}
# ORIGINAL CODE
# #!/usr/bin/perl -w
# use strict;
#
# open IN, $ARGV[0] or die;
# open OUT, '>'.$ARGV[0].'.expanded.txt' or die;
#
# my $line = <IN>;
# print OUT $line;
#
# while ($line = <IN>){
# my @r = split(/\t/,$line);
# my @gene = split(/;/,$r[16]); # UCSC_RefGene_Name
# my @id = split(/;/,$r[17]); # UCSC_RefGene_Accession
# my @ann = split(/;/,$r[18]); # UCSC_RefGene_Group
# if (scalar(@gene)>0){
# for (my$a=0;$a<(scalar(@gene));$a++){
# $r[16]=$gene[$a];
# $r[17]=$id[$a];
# $r[18]=$ann[$a];
# $line = join ("\t",@r);
# print OUT $line;
# }
# }
# else {
# print OUT $line;
# }
# }
#
# close OUT;
# exit;