Showing
4 changed files
with
252 additions
and
0 deletions
.idea/vcs.xml
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import re | ||
7 | + | ||
8 | +__author__ = 'CMendezC' | ||
9 | + | ||
10 | +# Objective: extract manually tagged growth conditions. | ||
11 | + | ||
12 | +# Parameters: | ||
13 | +# 1) --inputPath input path | ||
14 | +# 2) --outputPath output path | ||
15 | + | ||
16 | +# Output: | ||
17 | +# 1) Tab separated file | ||
18 | + | ||
19 | +# Execution: | ||
20 | +# python extract-manually-tagged-gcs.py | ||
21 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data | ||
22 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs | ||
23 | +# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs | ||
24 | + | ||
25 | +########################################################### | ||
26 | +# MAIN PROGRAM # | ||
27 | +########################################################### | ||
28 | + | ||
# Report columns, in output order. "Phase" appears once so that it yields a
# single column in the report.
REPORT_TAGS = ("Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype",
               "Phase", "Air", "Med", "Temp", "Supp")

# One manually tagged span: <tag>content followed by the "<" that opens the
# closing tag.
REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# Header lines that open a serie / sample section.
REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')


def collect_tagged_gcs(lines, hashGcs):
    """Add the tagged growth conditions found in *lines* to *hashGcs*.

    Parameters:
        lines: iterable of text lines (an open file works).
        hashGcs: dict updated in place, shaped as
            {serie: {sample: {tag: [content, ...]}}}.

    Returns:
        The same *hashGcs* object, for convenience.

    Tags or samples that appear before the section header they depend on are
    reported and skipped instead of raising NameError/KeyError.
    """
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')

        result = REGEX_SERIE.match(line)
        if result:
            serie = result.group('serie')
            # A sample belongs to one serie: forget the previous sample so
            # later tags are never looked up under the wrong serie.
            sample = None
            if serie in hashGcs:
                print("WARNING! duplicate serie")
            else:
                hashGcs[serie] = {}
            continue

        result = REGEX_SAMPLE.match(line)
        if result:
            if serie is None:
                print("WARNING! sample before serie: {}".format(result.group('sample')))
                continue
            sample = result.group('sample')
            if sample in hashGcs[serie]:
                print("WARNING! duplicate sample")
            else:
                hashGcs[serie][sample] = {}
            continue

        for m in REGEX_TAG_CONTENT.finditer(line):
            tag = m.group('tag')
            content = m.group('content').strip()
            if serie is None or sample is None:
                print("WARNING! tag outside a sample: {}".format(tag))
                continue
            # Content is printed as bytes so that consoles with a narrow
            # encoding do not fail on non-ASCII characters.
            print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace')))
            sampleTags = hashGcs[serie][sample]
            if tag in sampleTags:
                if content in sampleTags[tag]:
                    # GC content already stored for this tag
                    print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace')))
                else:
                    print("New content: {}".format(content))
                    sampleTags[tag].append(content)
                    print("hashGcs[serie][sample][tag]: {}".format(sampleTags[tag]))
            else:
                sampleTags[tag] = [content]
                print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace')))
    return hashGcs


def build_report_lines(hashGcs, tags=REPORT_TAGS):
    """Return the report as a list of tab-separated lines (no line endings).

    The first line is the header ("Serie", "Sample", then *tags*). Each later
    line is one sample; samples are sorted within their serie. Every row has
    exactly as many cells as the header: a tag with no content gives an empty
    cell, and several contents of one tag are joined with ", ".
    """
    lines = ['\t'.join(['Serie', 'Sample'] + list(tags))]
    for serie, hashSample in hashGcs.items():
        print("Serie: {}".format(serie))
        for sample, hashTag in sorted(hashSample.items()):
            print("\tSample: {}".format(sample))
            cells = [serie, sample]
            cells.extend(', '.join(hashTag.get(tag, [])) for tag in tags)
            # join (not concatenate + rstrip) keeps trailing empty cells.
            lines.append('\t'.join(cells))
    return lines


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the message and exits, so no sys.exit() is needed.
    if len(args) > 0:
        parser.error("None parameter entered.")
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # NOTE(review): the accumulator is shared by all input files, so each
    # report also lists the series read from earlier files. This mirrors the
    # original flow -- confirm whether a per-file report was intended.
    hashGcs = {}
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the walked directory so files in sub-directories open.
            with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
                collect_tagged_gcs(iFile, hashGcs)
            reportName = f.replace(".xml", ".report.csv")
            with open(os.path.join(options.outputPath, reportName), "w", encoding="utf-8") as oFile:
                oFile.writelines(row + "\n" for row in build_report_lines(hashGcs))
125 | + | ||
126 | + | ||
127 | + |
preprocessing-data/gzip-2-soft.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import gzip | ||
7 | +import shutil | ||
8 | + | ||
9 | +__author__ = 'CMendezC' | ||
10 | + | ||
11 | +# Objective: uncompress gzip soft file to text soft file | ||
12 | + | ||
13 | +# Parameters: | ||
14 | +# 1) --inputPath input path | ||
15 | +# 2) --outputPath output path | ||
16 | + | ||
17 | +# Output: | ||
18 | +# 1) Text soft file | ||
19 | + | ||
20 | +# Execution: | ||
21 | +# python gzip-2-soft.py | ||
22 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data | ||
23 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
24 | +# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
25 | + | ||
26 | +########################################################### | ||
27 | +# MAIN PROGRAM # | ||
28 | +########################################################### | ||
29 | + | ||
def soft_output_name(name):
    """Return the output file name for the gzip file called *name*.

    "X.soft.gz" becomes "X.txt"; any other "*.gz" name just loses its ".gz"
    suffix, so the uncompressed copy is never written under a ".gz" name.
    """
    if name.endswith('.soft.gz'):
        return name[:-len('.soft.gz')] + '.txt'
    if name.endswith('.gz'):
        return name[:-len('.gz')]
    return name


def gunzip_file(src, dst):
    """Uncompress gzip file *src* into *dst*.

    Returns True on success. On a read, write or decompression error the
    problem is reported on stderr and False is returned, so one bad file does
    not stop the run but is no longer silently ignored.
    """
    import zlib  # local import: only needed to name the decompression error

    try:
        with gzip.open(src, 'rb') as f_in:
            with open(dst, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    except (OSError, EOFError, zlib.error) as err:
        print("ERROR! could not uncompress {}: {}".format(src, err), file=sys.stderr)
        return False
    return True


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the message and exits, so no sys.exit() is needed.
    if len(args) > 0:
        parser.error("None parameter entered.")
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if f.endswith(".gz"):
                print("Processing...{}/{}".format(path, f))
                # Join with the walked directory so files in sub-directories open.
                gunzip_file(os.path.join(path, f),
                            os.path.join(options.outputPath, soft_output_name(f)))
preprocessing-data/soft-2-xml.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | + | ||
7 | +__author__ = 'CMendezC' | ||
8 | + | ||
9 | +# Objective: convert soft file to XML file: | ||
10 | +# include headings, tags, substitute & | ||
11 | + | ||
12 | +# Parameters: | ||
13 | +# 1) --inputPath input path | ||
14 | +# 2) --outputPath output path | ||
15 | + | ||
16 | +# Output: | ||
17 | +# 1) XML File with soft file content | ||
18 | + | ||
19 | +# Execution: | ||
20 | +# python soft-2-xml.py | ||
21 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
22 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
23 | +# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
24 | + | ||
25 | +########################################################### | ||
26 | +# MAIN PROGRAM # | ||
27 | +########################################################### | ||
28 | + | ||
# Text written before and after the soft file content.
XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n"
XML_FOOTER = "\n</gse>\n"


def convert_soft_to_xml(iFile, oFile):
    """Copy soft text from *iFile* to *oFile* wrapped in a <gse> element.

    Parameters:
        iFile: iterable of text lines (an open text file works).
        oFile: object with a write(str) method.

    Every "&" is written as "&amp;": a bare ampersand is not allowed in XML
    character data, so leaving it would make the output ill-formed. Other
    characters are copied unchanged.
    """
    oFile.write(XML_HEADER)
    for line in iFile:
        oFile.write(line.replace("&", "&amp;"))
    oFile.write(XML_FOOTER)


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the message and exits, so no sys.exit() is needed.
    if len(args) > 0:
        parser.error("None parameter entered.")
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if f.endswith("_family.txt"):
                print("Processing...{}/{}".format(path, f))
                # Join with the walked directory so files in sub-directories open.
                with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
                    with open(os.path.join(options.outputPath, f.replace(".txt", ".xml")), "w",
                              encoding="utf-8") as oFile:
                        convert_soft_to_xml(iFile, oFile)
61 | + |
-
Please register or login to post a comment