Estefani Gaytan Nunez

organizacion nueva

Showing 53 changed files with 0 additions and 364 deletions
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 -# -*- coding: UTF-8 -*-
2 -
3 -from optparse import OptionParser
4 -import os
5 -import sys
6 -import re
7 -
8 -__author__ = 'CMendezC'
9 -
10 -# Objective: extract manually tagged growth conditions.
11 -
12 -# Parameters:
13 -# 1) --inputPath input path
14 -# 2) --outputPath output path
15 -
16 -# Output:
17 -# 1) Tab separated file
18 -
19 -# Execution:
20 -# python extract-manually-tagged-gcs.py
21 -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
22 -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
23 -# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
24 -
25 -# python extract-manually-tagged-gcs.py
26 -# --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs"
27 -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
28 -# python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
29 -
30 -###########################################################
31 -# MAIN PROGRAM #
32 -###########################################################
33 -
# Tags from esquema-gcs.xsd at 11/09/2018
GC_TAGS = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion", "Med", "Technique", "Supp",
           "Air", "Temp", "pH", "Press", "OD", "Phase", "Rate", "Vess", "Agit"]

_REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
_REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
_REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')

# "&amp;" is decoded last on purpose: decoding it first would turn the
# escaped text "&amp;lt;" into "<" instead of the literal "&lt;".
_XML_ENTITIES = (("&lt;", "<"), ("&gt;", ">"), ("&quot;", "\""), ("&apos;", "\'"), ("&amp;", "&"))


def _unescape_xml(text):
    """Return *text* with the five predefined XML entities decoded once."""
    for entity, char in _XML_ENTITIES:
        text = text.replace(entity, char)
    return text


def _parse_tagged_lines(lines, seen_series):
    """Collect tagged growth conditions from the lines of one file.

    Parameters:
        lines: iterable of text lines (a file object works).
        seen_series: set of series ids found in previously parsed files;
            updated in place and used only to emit the duplicate warning.

    Returns:
        dict mapping serie -> sample -> tag -> list of distinct contents,
        in order of first appearance.
    """
    gcs = {}
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')
        result = _REGEX_SERIE.match(line)
        if result:
            serie = result.group('serie')
            # A new serie starts a new context; the previous sample no longer applies.
            sample = None
            if serie in seen_series:
                print("WARNING! duplicate serie")
            seen_series.add(serie)
            gcs.setdefault(serie, {})
            continue
        result = _REGEX_SAMPLE.match(line)
        if result:
            if serie is None:
                # No ^SERIES line seen yet in this file: nothing to attach the sample to.
                print("WARNING! sample without serie")
                continue
            sample = result.group('sample')
            if sample in gcs[serie]:
                print("WARNING! duplicate sample")
            else:
                gcs[serie][sample] = {}
            continue
        if serie is None or sample is None:
            # Tagged text outside of a serie/sample context cannot be reported.
            continue
        for m in _REGEX_TAG_CONTENT.finditer(line):
            content = _unescape_xml(m.group('content').strip())
            contents = gcs[serie][sample].setdefault(m.group('tag'), [])
            if content not in contents:
                contents.append(content)
    return gcs


def _build_rows(gcs, tags):
    """Return one report row (list of str) per sample: serie, sample, then one cell per tag."""
    rows = []
    for serie, samples in gcs.items():
        print("Serie: {}".format(serie))
        for sample, tag_contents in sorted(samples.items()):
            print("\tSample: {}".format(sample))
            rows.append([serie, sample] + [', '.join(tag_contents.get(tag, [])) for tag in tags])
    return rows


def _write_csv(file_path, rows):
    """Write *rows* to *file_path* as a fully quoted, comma separated file."""
    import csv  # local import: the csv module is only needed by this script's writer

    # No newline="" here so the platform newline translation of text mode is kept.
    with open(file_path, "w", encoding="utf-8") as oFile:
        writer = csv.writer(oFile, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(rows)


def main(argv=None):
    """Extract manually tagged growth conditions from every *_family.xml file.

    Parameters:
        argv: list of command line arguments; defaults to sys.argv[1:].

    For each input file a "<name>.report.csv" is written to the output path
    with the series and samples of that file only, and a single
    "GSE_family.complete-report.csv" gathers the rows of all files.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if len(args) > 0:
        parser.error("None parameter entered.")  # parser.error() exits by itself
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    header = ["Serie", "Sample"] + GC_TAGS
    seen_series = set()
    processed_files = 0
    saved_files = 0
    complete_report = []
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{} {}".format(path, f))
            # Join with the directory being walked, not the root, so nested files are found.
            with open(os.path.join(path, f), "r", encoding="utf-8", errors='replace') as iFile:
                gcs = _parse_tagged_lines(iFile, seen_series)
            processed_files += 1

            rows = _build_rows(gcs, GC_TAGS)
            _write_csv(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), [header] + rows)
            if not complete_report:
                # The complete report carries the header only once.
                complete_report.append(header)
            complete_report.extend(rows)
            saved_files += 1

    _write_csv(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), complete_report)

    print("Processed files: {}".format(processed_files))
    print("Saved files: {}".format(saved_files))


if __name__ == "__main__":
    main()
156 -
157 -
1 -# -*- coding: UTF-8 -*-
2 -import os
3 -import sys
4 -import argparse
5 -import re
6 -import numpy as np
7 -from datetime import *
8 -__author__ = 'KevinML'
9 -
10 -# Objective: Obtención del metadato y del contenido de todas las líneas con <Tags/> dentro de un archivo.
11 -
12 -# Parameters:
13 -# 1) --inputPath input path
14 -# 2) --outputPath output path
15 -
16 -# Output:
17 -# 1)
18 -
19 -# Execution:
20 -#Example 1
21 -#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
22 -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
23 -
24 -#Example 2
25 -#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
26 -#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
27 -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
28 -
29 -###########################################################
30 -# MAIN PROGRAM #
31 -###########################################################
32 -
_REGEX_TAG = re.compile(r'<[A-Za-z]+>')
_EXIT_FILE = r"exit_file.xml"


def _collect_metadata(lines):
    """Return the metadata of every tagged "key = value" line.

    Only lines containing an opening tag such as <Med> are considered.
    Each one is split at the first " = "; tagged lines without that
    separator are skipped because they carry no metadata name.

    Returns:
        dict mapping the metadata name to the list of its values, in
        order of appearance.
    """
    metadatos = {}
    for line in lines:
        line = line.strip('\n')
        if not _REGEX_TAG.search(line):
            continue
        # Split only once so that values containing " = " are kept whole.
        key, separator, value = line.partition(" = ")
        if not separator:
            continue
        metadatos.setdefault(key, []).append(value)
    return metadatos


def main(argv=None):
    """Report metadata name and content of all tagged lines found under the input path.

    Parameters:
        argv: list of command line arguments; defaults to sys.argv[1:].

    Writes a tab separated "exit_file.xml" into the output path with one
    row per (file, metadata name, content).
    """
    parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
                                     epilog= 'Bien Hecho!')
    parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de entrada.')
    parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de salida.')

    args = parser.parse_args(argv)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("Path to place output files: " + str(args.outputPath))

    archivo = {}
    for path, dirs, files in os.walk(args.inputPath):
        for f in files:
            # Join with the directory being walked so files in sub-directories are found.
            file_path = os.path.join(path, f)
            with open(file_path, mode='r', encoding="utf-8") as iFile:
                # Keyed by the path relative to the input root: identical to the bare
                # file name for top-level files, unambiguous for nested ones.
                archivo[os.path.relpath(file_path, args.inputPath)] = _collect_metadata(iFile)

    # Written once, after the walk, so each file is reported exactly once
    # and with its own metadata.
    with open(os.path.join(args.outputPath, _EXIT_FILE), mode="w", encoding="utf-8") as oFile:
        oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
        for arch in sorted(archivo):
            for k, v in sorted(archivo[arch].items()):
                for x in v:
                    oFile.write('{}\t{}\t{}\n'.format(arch, k, x))


if __name__ == "__main__":
    main()
1 -# -*- coding: UTF-8 -*-
2 -
3 -from optparse import OptionParser
4 -import os
5 -import sys
6 -import gzip
7 -import shutil
8 -
9 -__author__ = 'CMendezC'
10 -
11 -# Objective: uncompress gzip soft file to text soft file
12 -
13 -# Parameters:
14 -# 1) --inputPath input path
15 -# 2) --outputPath output path
16 -
17 -# Output:
18 -# 1) Text soft file
19 -
20 -# Execution:
21 -# python gzip-2-soft.py
22 -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
23 -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
24 -# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
25 -
26 -###########################################################
27 -# MAIN PROGRAM #
28 -###########################################################
29 -
def _output_name(name):
    """Return the output file name for the gzip file called *name*.

    "*.soft.gz" becomes "*.txt"; any other "*.gz" just loses the ".gz"
    suffix so uncompressed data is never stored under a ".gz" name.
    """
    if name.endswith('.soft.gz'):
        return name[:-len('.soft.gz')] + '.txt'
    return name[:-len('.gz')]


def main(argv=None):
    """Uncompress every .gz file found under the input path into the output path.

    Parameters:
        argv: list of command line arguments; defaults to sys.argv[1:].

    Files that cannot be uncompressed are reported and skipped; the rest
    of the directory tree is still processed.
    """
    import zlib  # local import: only needed to recognise corrupt deflate streams

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if len(args) > 0:
        parser.error("None parameter entered.")  # parser.error() exits by itself
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith(".gz"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the directory being walked so files in sub-directories are found.
            source = os.path.join(path, f)
            target = os.path.join(options.outputPath, _output_name(f))
            try:
                with gzip.open(source, 'rb') as f_in:
                    with open(target, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            except (OSError, EOFError, zlib.error) as err:
                # Best effort: keep going, but say which file failed and do not
                # leave a truncated output file behind.
                print("WARNING! could not uncompress {}: {}".format(source, err))
                try:
                    os.remove(target)
                except OSError:
                    pass  # nothing was written, or it cannot be removed


if __name__ == "__main__":
    main()
This diff is collapsed. Click to expand it.
1 -# -*- coding: UTF-8 -*-
2 -
3 -from optparse import OptionParser
4 -import os
5 -import sys
6 -
7 -__author__ = 'CMendezC'
8 -
9 -# Objective: convert soft file to XML file:
10 -# include headings, tags, substitute & and <
11 -
12 -# Parameters:
13 -# 1) --inputPath input path
14 -# 2) --outputPath output path
15 -
16 -# Output:
17 -# 1) XML File with soft file content
18 -
19 -# Execution:
20 -# python soft-2-xml.py
21 -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional
22 -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
23 -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
24 -# Additional files
25 -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
26 -
27 -###########################################################
28 -# MAIN PROGRAM #
29 -###########################################################
30 -
_XML_HEADING = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"
                "<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
                "xsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n")

# Only "&" and "<" are substituted; ">", quotes and apostrophes are written as they are.
_XML_ESCAPES = str.maketrans({"&": "&amp;", "<": "&lt;"})


def main(argv=None):
    """Convert every *_family.soft file under the input path into an XML file.

    Parameters:
        argv: list of command line arguments; defaults to sys.argv[1:].

    Each output file holds the XML heading, the soft file content with
    "&" and "<" substituted by their entities, and the closing </gse> tag.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if len(args) > 0:
        parser.error("None parameter entered.")  # parser.error() exits by itself
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    processedFiles = 0
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.soft"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the directory being walked so files in sub-directories are found.
            source = os.path.join(path, f)
            # Swap only the trailing extension, not every ".soft" in the name.
            target = os.path.join(options.outputPath, f[:-len(".soft")] + ".xml")
            with open(source, "r", encoding="utf-8", errors="replace") as iFile:
                with open(target, "w", encoding="utf-8") as oFile:
                    oFile.write(_XML_HEADING)
                    for line in iFile:
                        oFile.write(line.translate(_XML_ESCAPES))
                    oFile.write("\n</gse>\n")
            processedFiles += 1
    print("Processed files: {}".format(processedFiles))


if __name__ == "__main__":
    main()