Carlos-Francisco Méndez-Cruz

Sc

1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 + </component>
6 +</project>
...\ No newline at end of file ...\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +import re
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: extract manually tagged growth conditions.
11 +
12 +# Parameters:
13 +# 1) --inputPath input path
14 +# 2) --outputPath output path
15 +
16 +# Output:
17 +# 1) Tab separated file
18 +
19 +# Execution:
20 +# python extract-manually-tagged-gcs.py
21 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
22 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
23 +# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
24 +
25 +###########################################################
26 +# MAIN PROGRAM #
27 +###########################################################
28 +
if __name__ == "__main__":
    # Script flow:
    #   1) read --inputPath / --outputPath from the command line,
    #   2) walk the input directory looking for "*_family.xml" files,
    #   3) collect, per serie (GSE) and sample (GSM), the content of every
    #      inline tag found in the file,
    #   4) write a tab separated report into the output directory.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself, so no
    # explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Make sure the report directory exists before writing into it
    os.makedirs(options.outputPath, exist_ok=True)

    # hashGcs[serie][sample][tag] -> list of distinct contents, in reading order.
    # NOTE(review): this dictionary is shared by all input files, so every
    # report also contains the series read from previously processed files.
    # Kept as in the original script -- confirm this is the intended behavior.
    hashGcs = {}
    regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
    regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
    regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
    # Report columns; each tag appears once so that no column is duplicated
    tags = ["Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype",
            "Phase", "Air", "Med", "Temp", "Supp"]

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            # Join with the directory being walked (not the root) so files
            # found in subdirectories can be opened too.
            inputFile = os.path.join(path, f)
            print("Processing...{}".format(inputFile))

            # Parsing state is reset per file so that a file lacking its
            # ^SERIES / ^SAMPLE headers never reuses ids from another file.
            serie = None
            sample = None
            with open(inputFile, "r", encoding="utf-8") as iFile:
                for line in iFile:
                    line = line.strip('\n')

                    result = regexSerie.match(line)
                    if result:
                        serie = result.group('serie')
                        sample = None  # samples belong to the serie just opened
                        if serie in hashGcs:
                            print("WARNING! duplicate serie")
                        else:
                            hashGcs[serie] = {}
                        continue

                    result = regexSample.match(line)
                    if result:
                        if serie is None:
                            print("WARNING! sample found before any serie: {}".format(line))
                            continue
                        sample = result.group('sample')
                        if sample in hashGcs[serie]:
                            print("WARNING! duplicate sample")
                        else:
                            hashGcs[serie][sample] = {}
                        continue

                    for m in regexTagContent.finditer(line):
                        if serie is None or sample is None:
                            # Tagged text outside a sample cannot be assigned
                            print("WARNING! tagged content outside a sample was skipped")
                            break
                        tag = m.group('tag')
                        content = m.group('content').strip()
                        # Contents are encoded before printing so the console
                        # shows an escaped form of non-ASCII characters.
                        printable = content.encode(encoding='utf-8', errors='replace')
                        print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, printable))
                        hashTag = hashGcs[serie][sample]
                        if tag not in hashTag:
                            hashTag[tag] = [content]
                            print("New tag: {} and content: {}".format(tag, printable))
                        elif content in hashTag[tag]:
                            # GC content already in hash
                            print("Duplicated content: {}".format(printable))
                        else:
                            print("New content: {}".format(content))
                            hashTag[tag].append(content)
                            print("hashGcs[serie][sample][tag]: {}".format(hashTag[tag]))

            # Only the trailing ".xml" is replaced to build the report name
            reportName = f[:-len(".xml")] + ".report.csv"
            with open(os.path.join(options.outputPath, reportName), "w", encoding="utf-8") as oFile:
                oFile.write('\t'.join(['Serie', 'Sample'] + tags) + "\n")
                # Distinct loop names: the parsing variables are not reused here
                for serieId, hashSample in hashGcs.items():
                    print("Serie: {}".format(serieId))
                    for sampleId, hashTag in sorted(hashSample.items()):
                        print("\tSample: {}".format(sampleId))
                        # One cell per tag (empty when the tag is absent); the
                        # cells are joined so every row has as many columns
                        # as the header, even when the last tags are empty.
                        cells = [', '.join(hashTag.get(tag, [])) for tag in tags]
                        oFile.write('\t'.join([serieId, sampleId] + cells) + "\n")
125 +
126 +
127 +
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +import gzip
7 +import shutil
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: uncompress gzip soft file to text soft file
12 +
13 +# Parameters:
14 +# 1) --inputPath input path
15 +# 2) --outputPath output path
16 +
17 +# Output:
18 +# 1) Text soft file
19 +
20 +# Execution:
21 +# python gzip-2-soft.py
22 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
23 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
24 +# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
25 +
26 +###########################################################
27 +# MAIN PROGRAM #
28 +###########################################################
29 +
if __name__ == "__main__":
    # Script flow: read --inputPath / --outputPath, walk the input directory
    # and uncompress every ".gz" file found into the output directory.

    # Local import: only needed to report corrupt compressed data below
    import zlib

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself, so no
    # explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Make sure the output directory exists before writing into it
    os.makedirs(options.outputPath, exist_ok=True)

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith(".gz"):
                continue
            # Join with the directory being walked (not the root) so files
            # found in subdirectories can be opened too.
            inputFile = os.path.join(path, f)
            # "*.soft.gz" becomes "*.txt"; any other "*.gz" just loses the
            # ".gz" suffix so uncompressed data is never stored under a
            # ".gz" name.
            if f.endswith(".soft.gz"):
                outputName = f[:-len(".soft.gz")] + ".txt"
            else:
                outputName = f[:-len(".gz")]
            outputFile = os.path.join(options.outputPath, outputName)

            print("Processing...{}".format(inputFile))
            try:
                with gzip.open(inputFile, 'rb') as f_in, open(outputFile, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            except (OSError, EOFError, zlib.error) as error:
                # Best effort: report the failing file and continue with the
                # remaining ones instead of silently ignoring the problem.
                print("ERROR! could not uncompress {}: {}".format(inputFile, error),
                      file=sys.stderr)
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +
7 +__author__ = 'CMendezC'
8 +
9 +# Objective: convert soft file to XML file:
10 +# include headings, tags, substitute &
11 +
12 +# Parameters:
13 +# 1) --inputPath input path
14 +# 2) --outputPath output path
15 +
16 +# Output:
17 +# 1) XML File with soft file content
18 +
19 +# Execution:
20 +# python soft-2-xml.py
21 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
22 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
23 +# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
24 +
25 +###########################################################
26 +# MAIN PROGRAM #
27 +###########################################################
28 +
if __name__ == "__main__":
    # Script flow: read --inputPath / --outputPath, walk the input directory
    # and wrap the text of every "*_family.txt" file in a <gse> root element,
    # escaping the characters that are markup in XML.

    # Local import: escape() replaces "&", "<" and ">" with XML entities
    from xml.sax.saxutils import escape

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself, so no
    # explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Make sure the output directory exists before writing into it
    os.makedirs(options.outputPath, exist_ok=True)

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.txt"):
                continue
            # Join with the directory being walked (not the root) so files
            # found in subdirectories can be opened too.
            inputFile = os.path.join(path, f)
            # Only the trailing ".txt" is replaced to build the output name
            outputFile = os.path.join(options.outputPath, f[:-len(".txt")] + ".xml")

            print("Processing...{}".format(inputFile))
            with open(inputFile, "r", encoding="utf-8") as iFile, \
                    open(outputFile, "w", encoding="utf-8") as oFile:
                oFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n")
                for line in iFile:
                    # "<" and ">" are escaped along with "&": a raw "<" in
                    # the text would make the resulting XML malformed.
                    oFile.write(escape(line))
                oFile.write("\n</gse>\n")
61 +