Showing
53 changed files
with
0 additions
and
364 deletions
No preview for this file type
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/get-raw-sentences.sh
→
CoreNLP/bin/get-raw-sentences.sh
File moved
data-sets/file_output/exit_file.txt
deleted
100644 → 0
This diff could not be displayed because it is too large.
data-sets/file_output/exit_file.xml
deleted
100644 → 0
This diff could not be displayed because it is too large.
data-sets/gzip-data/borrame.txt
deleted
100644 → 0
File mode changed
1 | -# -*- coding: UTF-8 -*- | ||
2 | - | ||
3 | -from optparse import OptionParser | ||
4 | -import os | ||
5 | -import sys | ||
6 | -import re | ||
7 | - | ||
8 | -__author__ = 'CMendezC' | ||
9 | - | ||
10 | -# Objective: extract manually tagged growth conditions. | ||
11 | - | ||
12 | -# Parameters: | ||
13 | -# 1) --inputPath input path | ||
14 | -# 2) --outputPath output path | ||
15 | - | ||
16 | -# Output: | ||
17 | -# 1) Tab separated file | ||
18 | - | ||
19 | -# Execution: | ||
20 | -# python extract-manually-tagged-gcs.py | ||
21 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data | ||
22 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
23 | -# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
24 | - | ||
25 | -# python extract-manually-tagged-gcs.py | ||
26 | -# --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" | ||
27 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
28 | -# python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
29 | - | ||
30 | -########################################################### | ||
31 | -# MAIN PROGRAM # | ||
32 | -########################################################### | ||
33 | - | ||
34 | -if __name__ == "__main__": | ||
35 | - # Parameter definition | ||
36 | - parser = OptionParser() | ||
37 | - parser.add_option("--inputPath", dest="inputPath", | ||
38 | - help="Path to read input files", metavar="PATH") | ||
39 | - parser.add_option("--outputPath", dest="outputPath", | ||
40 | - help="Path to place output files", metavar="PATH") | ||
41 | - | ||
42 | - (options, args) = parser.parse_args() | ||
43 | - if len(args) > 0: | ||
44 | - parser.error("None parameter entered.") | ||
45 | - sys.exit(1) | ||
46 | - | ||
47 | - # Printing parameter values | ||
48 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
49 | - print("Path to read input files: " + str(options.inputPath)) | ||
50 | - print("Path to place output files: " + str(options.outputPath)) | ||
51 | - | ||
52 | - hashGcs = {} | ||
53 | - regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<') | ||
54 | - regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$') | ||
55 | - regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$') | ||
56 | - # Tags from esquema-gcs.xsd at 11/09/2018 | ||
57 | - tags = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion", "Med", "Technique", "Supp", "Air", "Temp", "pH", "Press", "OD", "Phase", "Rate", "Vess", "Agit", ] | ||
58 | - processed_files = 0 | ||
59 | - saved_files = 0 | ||
60 | - complete_report = [] | ||
61 | - # Walk directory to read files | ||
62 | - for path, dirs, files in os.walk(options.inputPath): | ||
63 | - for f in files: | ||
64 | - if f.endswith("_family.xml"): | ||
65 | - print("Processing...{} {}".format(options.inputPath, f)) | ||
66 | - #with open(os.path.join(options.inputPath, f), "r", encoding="utf-8") as iFile: | ||
67 | - with open(os.path.join(options.inputPath, f), "r", errors='replace') as iFile: | ||
68 | - # numline = 0 | ||
69 | - for line in iFile: | ||
70 | - # numline+=1 | ||
71 | - # if f.find("GSE41195") > -1: | ||
72 | - # print(numline) | ||
73 | - line = line.strip('\n') | ||
74 | - result = regexSerie.match(line) | ||
75 | - if result: | ||
76 | - serie = result.group('serie') | ||
77 | - if serie in hashGcs: | ||
78 | - print("WARNING! duplicate serie") | ||
79 | - else: | ||
80 | - hashGcs[serie] = {} | ||
81 | - continue | ||
82 | - result = regexSample.match(line) | ||
83 | - if result: | ||
84 | - sample = result.group('sample') | ||
85 | - if sample in hashGcs[serie]: | ||
86 | - print("WARNING! duplicate sample") | ||
87 | - else: | ||
88 | - hashGcs[serie][sample] = {} | ||
89 | - # hashGcs[serie] = hashSample | ||
90 | - #prevSample = sample | ||
91 | - continue | ||
92 | - result = regexTagContent.finditer(line) | ||
93 | - for m in result: | ||
94 | - tag = m.group('tag') | ||
95 | - content = m.group('content') | ||
96 | - content = content.strip() | ||
97 | - content = content.replace("&amp;", "&") | ||
98 | - content = content.replace("&lt;", "<") | ||
99 | - content = content.replace("&gt;", ">") | ||
100 | - content = content.replace("&quot;", "\"") | ||
101 | - content = content.replace("&apos;", "\'") | ||
102 | - #print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace'))) | ||
103 | - if tag in hashGcs[serie][sample]: | ||
104 | - if content in hashGcs[serie][sample][tag]: | ||
105 | - #print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace'))) | ||
106 | - pass # GC content already in hash | ||
107 | - else: | ||
108 | - # print("New content: {}".format(content)) | ||
109 | - hashGcs[serie][sample][tag].append(content) | ||
110 | - # print("hashGcs[serie][sample][tag]: {}".format(hashGcs[serie][sample][tag])) | ||
111 | - else: | ||
112 | - hashGcs[serie][sample][tag] = [content] | ||
113 | - #print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace'))) | ||
114 | - # print(hashGcs) | ||
115 | - processed_files+=1 | ||
116 | - #with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w", encoding="utf-8") as oFile: | ||
117 | - with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w") as oFile: | ||
118 | - output = '"Serie","Sample",' | ||
119 | - for tag in tags: | ||
120 | - output = output + '"' + tag + '",' | ||
121 | - output = output.rstrip(',') | ||
122 | - oFile.write(output + "\n") | ||
123 | - complete_report.append(output) | ||
124 | - for serie, hashSample in hashGcs.items(): | ||
125 | - print("Serie: {}".format(serie)) | ||
126 | - for sample, hashTag in sorted(hashSample.items()): | ||
127 | - print("\tSample: {}".format(sample)) | ||
128 | - pTags = [] | ||
129 | - for tag in tags: | ||
130 | - if tag in hashTag: | ||
131 | - pTags.append(', '.join(hashTag[tag])) | ||
132 | - else: | ||
133 | - pTags.append('') | ||
134 | - | ||
135 | - output = '"{}","{}",'.format(serie, sample) | ||
136 | - for tag in pTags: | ||
137 | - output = output + '"' + tag + '",' | ||
138 | - output = output.rstrip(',') | ||
139 | - oFile.write(output + "\n") | ||
140 | - complete_report.append(output) | ||
141 | - # oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(sample, serie, Technique, Orgn, Strain, Substrain, Gversion, Gtype, Phase, Air, Med, Temp, Supp)) | ||
142 | - # for tag, listContent in sorted(hashTag.items()): | ||
143 | - # print("\t\tTag: {}".format(tag)) | ||
144 | - # for content in sorted(listContent): | ||
145 | - # print("\t\t\tContent: {}".format(content.encode(encoding='utf-8', errors='replace'))) | ||
146 | - # # oFile.write("Serie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace'))) | ||
147 | - # oFile.write("Serie: {}\tSample: {}\tTag: {}\tContent: {}\n".format(serie, sample, tag, content)) | ||
148 | - saved_files+=1 | ||
149 | - | ||
150 | - with open(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), "w") as oFile: | ||
151 | - for line in complete_report: | ||
152 | - oFile.write(line + "\n") | ||
153 | - | ||
154 | - print("Processed files: {}".format(processed_files)) | ||
155 | - print("Saved files: {}".format(saved_files)) | ||
156 | - | ||
157 | - |
data-sets/scripts/file_output.py
deleted
100644 → 0
1 | -# -*- coding: UTF-8 -*- | ||
2 | -import os | ||
3 | -import sys | ||
4 | -import argparse | ||
5 | -import re | ||
6 | -import numpy as np | ||
7 | -from datetime import * | ||
8 | -__author__ = 'KevinML' | ||
9 | - | ||
10 | -# Objective: Obtención del metadato y del contenido de todas las líneas con <Tags/> dentro de un archivo. | ||
11 | - | ||
12 | -# Parameters: | ||
13 | -# 1) --inputPath input path | ||
14 | -# 2) --outputPath output path | ||
15 | - | ||
16 | -# Output: | ||
17 | -# 1) | ||
18 | - | ||
19 | -# Execution: | ||
20 | -#Example 1 | ||
21 | -#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | ||
22 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | ||
23 | - | ||
24 | -#Example 2 | ||
25 | -#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py | ||
26 | -#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | ||
27 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | ||
28 | - | ||
29 | -########################################################### | ||
30 | -# MAIN PROGRAM # | ||
31 | -########################################################### | ||
32 | - | ||
33 | -parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>', | ||
34 | - epilog= 'Bien Hecho!') | ||
35 | -parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True, | ||
36 | - help='Ingrese el archivo de entrada.') | ||
37 | -parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True, | ||
38 | - help='Ingrese el archivo de salida.') | ||
39 | - | ||
40 | -args = parser.parse_args() | ||
41 | - | ||
42 | -#if len(args) != 2: | ||
43 | -# parser.error("Se introdujeron mas o menos de 2 parametros.") | ||
44 | -# sys.exit(1) | ||
45 | - | ||
46 | -# Printing parameter values | ||
47 | -print('-------------------------------- PARAMETERS --------------------------------') | ||
48 | -print("Path to read input files: " + str(args.inputPath)) | ||
49 | -print("Path to place output files: " + str(args.outputPath)) | ||
50 | - | ||
51 | -#Modificación TEMPORAL | ||
52 | - | ||
53 | -archivo = {} | ||
54 | -regexTag = re.compile(r'<[A-Za-z]+>') | ||
55 | -exit_file = r"exit_file.xml" | ||
56 | - | ||
57 | -with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile: | ||
58 | - oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today())) | ||
59 | - | ||
60 | -for path, dirs, files in os.walk(args.inputPath): | ||
61 | - for f in files: | ||
62 | - metadatos = {} | ||
63 | - with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile: | ||
64 | - for line in iFile: | ||
65 | - line = line.strip('\n') | ||
66 | - if regexTag.search(line): | ||
67 | - renglon = line.split(" = ") | ||
68 | - if renglon[0] in metadatos: | ||
69 | - metadatos[renglon[0]].append(renglon[1]) | ||
70 | - else: | ||
71 | - metadatos[renglon[0]] = [renglon[1]] | ||
72 | - | ||
73 | - archivo[f] = metadatos | ||
74 | - | ||
75 | - with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile: | ||
76 | - #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido') | ||
77 | - for arch in sorted(archivo): | ||
78 | - for k,v in sorted(metadatos.items()): | ||
79 | - for x in v: | ||
80 | - oFile.write('{}\t{}\t{}\n'.format(arch, k, x)) |
data-sets/scripts/gzip-2-soft.py
deleted
100644 → 0
1 | -# -*- coding: UTF-8 -*- | ||
2 | - | ||
3 | -from optparse import OptionParser | ||
4 | -import os | ||
5 | -import sys | ||
6 | -import gzip | ||
7 | -import shutil | ||
8 | - | ||
9 | -__author__ = 'CMendezC' | ||
10 | - | ||
11 | -# Objective: uncompress gzip soft file to text soft file | ||
12 | - | ||
13 | -# Parameters: | ||
14 | -# 1) --inputPath input path | ||
15 | -# 2) --outputPath output path | ||
16 | - | ||
17 | -# Output: | ||
18 | -# 1) Text soft file | ||
19 | - | ||
20 | -# Execution: | ||
21 | -# python gzip-2-soft.py | ||
22 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data | ||
23 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
24 | -# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
25 | - | ||
26 | -########################################################### | ||
27 | -# MAIN PROGRAM # | ||
28 | -########################################################### | ||
29 | - | ||
30 | -if __name__ == "__main__": | ||
31 | - # Parameter definition | ||
32 | - parser = OptionParser() | ||
33 | - parser.add_option("--inputPath", dest="inputPath", | ||
34 | - help="Path to read input files", metavar="PATH") | ||
35 | - parser.add_option("--outputPath", dest="outputPath", | ||
36 | - help="Path to place output files", metavar="PATH") | ||
37 | - | ||
38 | - (options, args) = parser.parse_args() | ||
39 | - if len(args) > 0: | ||
40 | - parser.error("None parameter entered.") | ||
41 | - sys.exit(1) | ||
42 | - | ||
43 | - # Printing parameter values | ||
44 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
45 | - print("Path to read input files: " + str(options.inputPath)) | ||
46 | - print("Path to place output files: " + str(options.outputPath)) | ||
47 | - | ||
48 | - # Walk directory to read files | ||
49 | - for path, dirs, files in os.walk(options.inputPath): | ||
50 | - for f in files: | ||
51 | - if f.endswith(".gz"): | ||
52 | - print("Processing...{}/{}".format(options.inputPath, f)) | ||
53 | - try: | ||
54 | - with gzip.open(os.path.join(options.inputPath, f), 'rb') as f_in: | ||
55 | - with open(os.path.join(options.outputPath, f.replace('.soft.gz', '.txt')), 'wb') as f_out: | ||
56 | - shutil.copyfileobj(f_in, f_out) | ||
57 | - except: | ||
58 | - pass |
data-sets/scripts/output.txt
deleted
100644 → 0
This diff is collapsed. Click to expand it.
data-sets/scripts/soft-2-xml.py
deleted
100644 → 0
1 | -# -*- coding: UTF-8 -*- | ||
2 | - | ||
3 | -from optparse import OptionParser | ||
4 | -import os | ||
5 | -import sys | ||
6 | - | ||
7 | -__author__ = 'CMendezC' | ||
8 | - | ||
9 | -# Objective: convert soft file to XML file: | ||
10 | -# include headings, tags, substitute & and < | ||
11 | - | ||
12 | -# Parameters: | ||
13 | -# 1) --inputPath input path | ||
14 | -# 2) --outputPath output path | ||
15 | - | ||
16 | -# Output: | ||
17 | -# 1) XML File with soft file content | ||
18 | - | ||
19 | -# Execution: | ||
20 | -# python soft-2-xml.py | ||
21 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional | ||
22 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
23 | -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
24 | -# Additional files | ||
25 | -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
26 | - | ||
27 | -########################################################### | ||
28 | -# MAIN PROGRAM # | ||
29 | -########################################################### | ||
30 | - | ||
31 | -if __name__ == "__main__": | ||
32 | - # Parameter definition | ||
33 | - parser = OptionParser() | ||
34 | - parser.add_option("--inputPath", dest="inputPath", | ||
35 | - help="Path to read input files", metavar="PATH") | ||
36 | - parser.add_option("--outputPath", dest="outputPath", | ||
37 | - help="Path to place output files", metavar="PATH") | ||
38 | - | ||
39 | - (options, args) = parser.parse_args() | ||
40 | - if len(args) > 0: | ||
41 | - parser.error("None parameter entered.") | ||
42 | - sys.exit(1) | ||
43 | - | ||
44 | - # Printing parameter values | ||
45 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
46 | - print("Path to read input files: " + str(options.inputPath)) | ||
47 | - print("Path to place output files: " + str(options.outputPath)) | ||
48 | - | ||
49 | - # Walk directory to read files | ||
50 | - processedFiles = 0 | ||
51 | - for path, dirs, files in os.walk(options.inputPath): | ||
52 | - for f in files: | ||
53 | - if f.endswith("_family.soft"): | ||
54 | - print("Processing...{}/{}".format(options.inputPath, f)) | ||
55 | - softText = '' | ||
56 | - with open(os.path.join(options.inputPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
57 | - with open(os.path.join(options.outputPath, f.replace(".soft", ".xml")), "w", | ||
58 | - encoding="utf-8") as oFile: | ||
59 | - oFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n") | ||
60 | - for line in iFile: | ||
61 | - line = line.replace("&", "&amp;") | ||
62 | - line = line.replace("<", "&lt;") | ||
63 | - # line = line.replace(">", "&gt;") | ||
64 | - # line = line.replace("\"", "&quot;") | ||
65 | - # line = line.replace("\'", "&apos;") | ||
66 | - oFile.write(line) | ||
67 | - oFile.write("\n</gse>\n") | ||
68 | - processedFiles+=1 | ||
69 | - print("Processed files: {}".format(processedFiles)) |
File mode changed
data-sets/soft-data/borrame.txt.txt
deleted
100644 → 0
File mode changed
data-sets/xml-data/borrame.txt
deleted
100644 → 0
File mode changed
-
Please register or login to post a comment