Carlos-Francisco Méndez-Cruz

Sc

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
import re
__author__ = 'CMendezC'
# Objective: extract manually tagged growth conditions.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Tab separated file
# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
###########################################################
# MAIN PROGRAM #
###########################################################
# Report columns, in output order. "Phase" is listed once: the original list
# repeated it, which produced a duplicated column in every report.
_GC_TAGS = ["Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype",
            "Phase", "Air", "Med", "Temp", "Supp"]

# <Tag>content< : a manually added tag and the text up to the next '<'.
_REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# SOFT entity headers that open a serie / a sample section.
_REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
_REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')


def _collect_tagged_gcs(lines, hashGcs):
    """Scan *lines* and add every tagged content to *hashGcs* in place.

    *hashGcs* maps serie -> sample -> tag -> list of distinct contents.
    Tagged text found before any sample header of the current serie is
    reported and skipped instead of raising NameError/KeyError.
    """
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')

        result = _REGEX_SERIE.match(line)
        if result:
            serie = result.group('serie')
            # A sample belongs to one serie: forget the previous one.
            sample = None
            if serie in hashGcs:
                print("WARNING! duplicate serie")
            else:
                hashGcs[serie] = {}
            continue

        result = _REGEX_SAMPLE.match(line)
        if result:
            if serie is None:
                print("WARNING! sample {} found before any serie; skipped".format(result.group('sample')))
                continue
            sample = result.group('sample')
            if sample in hashGcs[serie]:
                print("WARNING! duplicate sample")
            else:
                hashGcs[serie][sample] = {}
            continue

        for m in _REGEX_TAG_CONTENT.finditer(line):
            tag = m.group('tag')
            content = m.group('content').strip()
            if serie is None or sample is None:
                print("WARNING! tag {} found outside of a sample; skipped".format(tag))
                continue
            print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace')))
            hashTag = hashGcs[serie][sample]
            if tag in hashTag:
                if content in hashTag[tag]:
                    # GC content already in hash
                    print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace')))
                else:
                    print("New content: {}".format(content))
                    hashTag[tag].append(content)
                    print("hashGcs[serie][sample][tag]: {}".format(hashTag[tag]))
            else:
                hashTag[tag] = [content]
                print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace')))


def _build_report_rows(hashGcs, tags):
    """Return the report as a list of tab separated rows (header first).

    Every row has exactly ``2 + len(tags)`` cells. Rows are built with
    ``'\\t'.join`` because stripping trailing tabs (as the original did)
    also removes trailing empty cells and misaligns rows with the header.
    """
    rows = ['\t'.join(['Serie', 'Sample'] + list(tags))]
    for serie, hashSample in hashGcs.items():
        print("Serie: {}".format(serie))
        for sample, hashTag in sorted(hashSample.items()):
            print("\tSample: {}".format(sample))
            cells = [', '.join(hashTag.get(tag, [])) for tag in tags]
            rows.append('\t'.join([serie, sample] + cells))
    return rows


def extract_tagged_gcs(inputPath, outputPath):
    """Extract manually tagged growth conditions from ``*_family.xml`` files.

    Walks *inputPath* recursively. For each ``*_family.xml`` file found, the
    tagged contents are collected and a tab separated report named
    ``<file>.report.csv`` is written into *outputPath*.

    Returns the dictionary serie -> sample -> tag -> list of contents.
    """
    hashGcs = {}
    for path, _dirs, files in os.walk(inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the walked directory, not inputPath, so that files
            # located in subdirectories can actually be opened.
            with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
                _collect_tagged_gcs(iFile, hashGcs)
            # NOTE(review): the report is written once per processed file and
            # contains every serie collected so far (hashGcs is shared by all
            # files) -- confirm this matches the intended nesting.
            reportName = f[:-len(".xml")] + ".report.csv"
            with open(os.path.join(outputPath, reportName), "w", encoding="utf-8") as oFile:
                oFile.write('\n'.join(_build_report_rows(hashGcs, _GC_TAGS)) + '\n')
    return hashGcs


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself.
    if len(args) > 0:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    extract_tagged_gcs(options.inputPath, options.outputPath)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
import gzip
import shutil
__author__ = 'CMendezC'
# Objective: uncompress gzip soft file to text soft file
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Text soft file
# Execution:
# python gzip-2-soft.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
###########################################################
# MAIN PROGRAM #
###########################################################
def _uncompressed_name(filename):
    """Return the output file name for a gzip file name.

    A trailing ``.soft.gz`` (or, failing that, ``.gz``) is replaced by
    ``.txt`` so the uncompressed output never keeps a ``.gz`` name.
    """
    for suffix in ('.soft.gz', '.gz'):
        if filename.endswith(suffix):
            return filename[:-len(suffix)] + '.txt'
    return filename


def gunzip_soft_files(inputPath, outputPath):
    """Uncompress every ``*.gz`` file under *inputPath* into *outputPath*.

    *inputPath* is walked recursively; all outputs are placed directly in
    *outputPath*. A file that cannot be read or uncompressed is reported on
    stderr and skipped, so one bad archive does not stop the batch.

    Returns the list of output paths that were written completely.
    """
    # Local import: zlib.error is raised for corrupted compressed data and
    # the module is not imported at file level.
    import zlib

    written = []
    for path, _dirs, files in os.walk(inputPath):
        for f in files:
            if not f.endswith(".gz"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the walked directory, not inputPath, so that files
            # located in subdirectories can actually be opened.
            source = os.path.join(path, f)
            target = os.path.join(outputPath, _uncompressed_name(f))
            try:
                with gzip.open(source, 'rb') as f_in:
                    with open(target, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            except (OSError, EOFError, zlib.error) as err:
                # Best effort, but never silent: the output for this file
                # may be missing or incomplete.
                print("WARNING! could not uncompress {}: {}".format(source, err), file=sys.stderr)
                continue
            written.append(target)
    return written


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself.
    if len(args) > 0:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    gunzip_soft_files(options.inputPath, options.outputPath)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
__author__ = 'CMendezC'
# Objective: convert soft file to XML file:
# include headings, tags, substitute &
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) XML File with soft file content
# Execution:
# python soft-2-xml.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
###########################################################
# MAIN PROGRAM #
###########################################################
# Fixed text written before and after the SOFT content of every XML file.
_XML_PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n"
_XML_EPILOG = "\n</gse>\n"


def soft_to_xml(inputPath, outputPath):
    """Wrap every ``*_family.txt`` SOFT file under *inputPath* into XML.

    *inputPath* is walked recursively. Each file is copied line by line
    into ``<name>.xml`` inside *outputPath*, between an XML prolog with a
    ``<gse>`` root element and its closing tag. The characters ``&``, ``<``
    and ``>`` are escaped: escaping only ``&`` (as the original did) yields
    malformed XML whenever the SOFT text contains a literal ``<``.

    Returns the list of XML paths written.
    """
    # Local import: the helper is not imported at file level.
    from xml.sax.saxutils import escape

    written = []
    for path, _dirs, files in os.walk(inputPath):
        for f in files:
            if not f.endswith("_family.txt"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Replace only the trailing extension, not every ".txt" in the name.
            target = os.path.join(outputPath, f[:-len(".txt")] + ".xml")
            # Join with the walked directory, not inputPath, so that files
            # located in subdirectories can actually be opened.
            with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
                with open(target, "w", encoding="utf-8") as oFile:
                    oFile.write(_XML_PROLOG)
                    for line in iFile:
                        oFile.write(escape(line))
                    oFile.write(_XML_EPILOG)
            written.append(target)
    return written


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself.
    if len(args) > 0:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    soft_to_xml(options.inputPath, options.outputPath)