# extract-manually-tagged-gcs.py
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
import re
__author__ = 'CMendezC'
# Objective: extract manually tagged growth conditions.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Tab separated file
# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
###########################################################
# HELPERS #
###########################################################
# One tagged span: "<Tag>content<"; the trailing "<" is the start of the
# closing tag and is consumed by the match.
regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# Header lines that announce the serie / sample the following lines belong to.
regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')

# Report columns, in output order (after the leading Serie and Sample columns).
# NOTE(review): "Phase" appears twice, so the report has two identical
# "Phase" columns. Kept unchanged because readers of the report may depend on
# column positions -- confirm before removing the duplicate.
tags = ("Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype",
        "Phase", "Phase", "Air", "Med", "Temp", "Supp")


def parse_tagged_lines(lines, gcs=None):
    """Collect the tagged contents found in the lines of one input file.

    Parameters:
        lines: iterable of text lines (for example an open text file).
        gcs: optional dict to update in place, shaped as
            serie -> sample -> tag -> list of distinct contents.
            A new dict is created when omitted.

    Returns:
        The updated dict.

    A "^SERIES = GSE..." line selects the current serie and a
    "^SAMPLE = GSM..." line selects the current sample; tagged spans on any
    other line are stored under the current serie and sample. Sample lines or
    tagged spans that appear before their serie/sample header are reported
    and skipped instead of failing on an unbound or stale name.
    """
    if gcs is None:
        gcs = {}
    # The context always starts empty so nothing leaks in from a previous file.
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')
        result = regexSerie.match(line)
        if result:
            serie = result.group('serie')
            # Samples belong to one serie: wait for the next sample header.
            sample = None
            if serie in gcs:
                print("WARNING! duplicate serie")
            else:
                gcs[serie] = {}
            continue
        result = regexSample.match(line)
        if result:
            if serie is None:
                sample = None
                print("WARNING! sample {} found before any serie; skipped".format(result.group('sample')))
                continue
            sample = result.group('sample')
            if sample in gcs[serie]:
                print("WARNING! duplicate sample")
            else:
                gcs[serie][sample] = {}
            continue
        for m in regexTagContent.finditer(line):
            tag = m.group('tag')
            content = m.group('content').strip()
            if serie is None or sample is None:
                print("WARNING! tag {} found before any serie/sample; skipped".format(tag))
                continue
            print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace')))
            hashTag = gcs[serie][sample]
            if tag in hashTag:
                if content in hashTag[tag]:
                    # Content already stored for this tag: keep a single copy.
                    print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace')))
                else:
                    print("New content: {}".format(content))
                    hashTag[tag].append(content)
                    print("hashGcs[serie][sample][tag]: {}".format(hashTag[tag]))
            else:
                hashTag[tag] = [content]
                print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace')))
    return gcs


def build_report_lines(gcs, tags=tags):
    """Return the tab-separated report as a list of lines without newlines.

    Parameters:
        gcs: dict shaped as serie -> sample -> tag -> list of contents.
        tags: sequence of tag names, one report column each.

    Returns:
        The header line followed by one line per sample (samples sorted
        within each serie). The contents of a tag are joined with ", ".

    Every row carries one cell per tag, empty when the tag is missing, so all
    rows have exactly as many columns as the header even when the last tags
    are empty.
    """
    reportLines = ['\t'.join(['Serie', 'Sample'] + list(tags))]
    for serie, hashSample in gcs.items():
        print("Serie: {}".format(serie))
        for sample, hashTag in sorted(hashSample.items()):
            print("\tSample: {}".format(sample))
            cells = [', '.join(hashTag.get(tag, [])) for tag in tags]
            reportLines.append('\t'.join([serie, sample] + cells))
    return reportLines


###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the usage message and exits by itself.
        parser.error("None parameter entered.")
    if not options.inputPath or not options.outputPath:
        parser.error("Options --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # NOTE(review): this dict is shared by all input files, so the report
    # written for a file also lists the series parsed from earlier files --
    # confirm that this accumulation is intended.
    hashGcs = {}
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the directory being walked, not the root input path,
            # so files found in sub-directories are opened where they live.
            with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
                parse_tagged_lines(iFile, hashGcs)
            # The report is named after the input file it was produced for.
            with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w", encoding="utf-8") as oFile:
                for reportLine in build_report_lines(hashGcs, tags):
                    oFile.write(reportLine + "\n")