# extract-manually-tagged-gcs.py
# -*- coding: UTF-8 -*-
import csv
import os
import re
import sys
from optparse import OptionParser
from xml.sax.saxutils import unescape
__author__ = 'CMendezC'
# Objective: extract manually tagged growth conditions.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Comma-separated file with quoted fields (one <name>_family.report.csv per input <name>_family.xml)
# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
###########################################################
# HELPERS #
###########################################################
# Matches one "<tag>content<" pair; a single line may contain several.
regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# Header lines that open a series / sample section in a *_family.xml file.
regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
# Tags reported as CSV columns, in output order.
tags = ["Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype", "Phase", "Air", "Med", "Temp", "Supp",
        "pH", "Press", "OD", "Rate", "Vess", "Agit", "Name", "Anti"]


def decodeEntities(content):
    """Return *content* stripped of surrounding whitespace with XML entities decoded.

    Decodes &amp;, &lt;, &gt;, &quot; and &apos; (plus the numeric form &#39;).
    The ampersand is decoded last, so already-escaped text such as
    "&amp;lt;" becomes "&lt;" and is not decoded a second time.
    """
    return unescape(content.strip(), {"&quot;": '"', "&apos;": "'", "&#39;": "'"})


def collectTaggedContent(lines, hashGcs):
    """Add the tagged contents found in *lines* to *hashGcs*.

    Parameters:
        lines: iterable of text lines (an open file works).
        hashGcs: dict updated in place, shaped as
            {serie: {sample: {tag: [content, ...]}}}.

    Contents are stored once per (serie, sample, tag), in order of first
    appearance. A sample header seen before any series header, and tagged
    text seen before any sample header of the current series, cannot be
    attributed; both are reported with a warning and skipped.
    """
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')
        result = regexSerie.match(line)
        if result:
            serie = result.group('serie')
            # A new series starts with no current sample; keeping the old
            # one would attach contents to a sample of another series.
            sample = None
            if serie in hashGcs:
                print("WARNING! duplicate serie")
            else:
                hashGcs[serie] = {}
            continue
        result = regexSample.match(line)
        if result:
            if serie is None:
                print("WARNING! sample {} found before any serie; skipped".format(result.group('sample')))
                continue
            sample = result.group('sample')
            if sample in hashGcs[serie]:
                print("WARNING! duplicate sample")
            else:
                hashGcs[serie][sample] = {}
            continue
        for m in regexTagContent.finditer(line):
            if sample is None:
                print("WARNING! tagged content outside a sample skipped: {}".format(m.group(0)))
                continue
            tag = m.group('tag')
            content = decodeEntities(m.group('content'))
            contents = hashGcs[serie][sample].setdefault(tag, [])
            if content in contents:
                continue  # GC content already in hash
            if contents:
                # Same messages as before: only reported when a tag gains
                # an additional, different content.
                print("New content: {}".format(content))
            contents.append(content)
            if len(contents) > 1:
                print("hashGcs[serie][sample][tag]: {}".format(contents))


def writeReport(oFile, hashGcs):
    """Write *hashGcs* to the open text file *oFile* as a fully quoted CSV.

    The first row is the header ("Serie", "Sample", then every name in
    `tags`). Each following row is one sample, samples sorted by name
    within their series; the contents of a tag are joined with ", " and
    missing tags are written as empty fields. Quotes inside a content are
    escaped by the csv module so the row stays parseable.
    """
    # lineterminator="\n" keeps the line endings the script always wrote
    # (the file object still applies the platform newline translation).
    writer = csv.writer(oFile, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow(["Serie", "Sample"] + tags)
    for serie, hashSample in hashGcs.items():
        print("Serie: {}".format(serie))
        for sample, hashTag in sorted(hashSample.items()):
            print("\tSample: {}".format(sample))
            writer.writerow([serie, sample] + [', '.join(hashTag.get(tag, [])) for tag in tags])


###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself.
    if len(args) > 0:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    os.makedirs(options.outputPath, exist_ok=True)

    # NOTE(review): hashGcs is shared by all input files, so every report
    # holds the series of the files processed so far, not only its own.
    # Kept as found; confirm whether one report per file was intended.
    hashGcs = {}
    # Walk directory to read files
    for path, _dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{} {}".format(path, f))
            # Join with the directory os.walk is visiting, not with the
            # top-level input path, so files in subdirectories open too.
            # NOTE(review): files are read and written with the platform
            # default encoding, as before.
            with open(os.path.join(path, f), "r") as iFile:
                collectTaggedContent(iFile, hashGcs)
            with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w") as oFile:
                writeReport(oFile, hashGcs)