Carlos-Francisco Méndez-Cruz

GCs GEO extraction

This diff is collapsed. Click to expand it.
#!/bin/python2.7
# Closing annotation tags. Every one of them maps to the same label, 'O',
# so the table is built from the list of tags plus that single shared value.
out_labels = dict.fromkeys(
    [
        '</Air>',
        '</Gtype>',
        '</Gversion>',
        '</Med>',
        '</Orgn>',
        '</Phase>',
        '</Sample>',
        '</Serie>',
        '</Strain>',
        '</Substrain>',
        '</Supp>',
        '</Technique>',
        '</Temp>',
        '</Name>',
        '</OD>',
        '</Anti>',
        '</Agit>',
        '</Vess>',
    ],
    'O'
)
# Opening annotation tags. Each tag maps to its own name with the surrounding
# angle brackets removed (e.g. '<Air>' -> 'Air'), so the value is derived from
# the key instead of being spelled out twice.
in_labels = dict(
    (opening_tag, opening_tag[1:-1])
    for opening_tag in (
        '<Air>',
        '<Gtype>',
        '<Gversion>',
        '<Med>',
        '<Orgn>',
        '<Phase>',
        '<Sample>',
        '<Serie>',
        '<Strain>',
        '<Substrain>',
        '<Supp>',
        '<Technique>',
        '<Temp>',
        '<Name>',
        '<OD>',
        '<Anti>',
        '<Agit>',
        '<Vess>',
    )
)
import re
# Original author's note (translated): "Contenido" (content) column of
# "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt".
# NOTE(review): presumably the file at `inpath` was derived from that column;
# confirm against the step that produces it.
inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'

# Running label attached to every token that is written out. It starts as 'O',
# switches to a tag's name when an opening tag is read, and goes back to 'O'
# when a closing tag is read. It is never reset anywhere else, so it carries
# over from one output line to the next.
flag = 'O'

# The input is opened first so that a missing or unreadable input file does
# not leave behind a freshly truncated, empty output file.
with open(inpath, 'r') as input_file, open(outpath, 'w') as out:
    for line in input_file:
        # Split once per row. The line terminator is removed beforehand so it
        # cannot stay glued to the last field (which would stop a tag in the
        # final column from matching and would leak a newline into the output).
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            # Rows without a second column carry no token; skip them.
            continue

        token = fields[1]
        if token in out_labels:
            # Closing tag: consumed, not written. Checked first so that it
            # keeps precedence over an opening tag, as in the original code.
            flag = out_labels[token]
        elif token in in_labels:
            # Opening tag: consumed, not written.
            flag = in_labels[token]
        elif token == "PGCGROWTHCONDITIONS":
            # Marker token: ends the current output line.
            out.write('\n')
        else:
            # Ordinary token: write columns 1-3 (0-based) and the current
            # label joined by '|', followed by a single separating space.
            out.write('|'.join(fields[1:4]) + '|' + flag + ' ')
\ No newline at end of file
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.