parsed_sentences_from_labels_v3.py
1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/python2.7
# Closing annotation tags. Every one of them maps to 'O', the label the
# script switches back to once it reads a closing-tag token.
out_labels = dict.fromkeys(
    (
        '</Air>',
        '</Gtype>',
        '</Gversion>',
        '</Med>',
        '</Orgn>',
        '</Phase>',
        '</Sample>',
        '</Serie>',
        '</Strain>',
        '</Substrain>',
        '</Supp>',
        '</Technique>',
        '</Temp>',
        '</Name>',
        '</OD>',
        '</Anti>',
        '</Agit>',
        '</Vess>',
    ),
    'O',
)
# Opening annotation tags, each mapped to its bare label name
# (e.g. '<Air>' -> 'Air'). The script adopts that name as the current
# label when it reads an opening-tag token.
in_labels = {
    '<' + name + '>': name
    for name in (
        'Air',
        'Gtype',
        'Gversion',
        'Med',
        'Orgn',
        'Phase',
        'Sample',
        'Serie',
        'Strain',
        'Substrain',
        'Supp',
        'Technique',
        'Temp',
        'Name',
        'OD',
        'Anti',
        'Agit',
        'Vess',
    )
}
import re
# "Contenido" (content) column of "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt"
inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'


def write_labeled_sentences(rows, out, open_tags, close_tags,
                            sentence_end="PGCGROWTHCONDITIONS"):
    """Write labelled tokens, one sentence per output line.

    Each row is a tab-separated record whose second column is the token.
    Rows with fewer than two columns are ignored. For every other row:

    * a token found in ``open_tags`` or ``close_tags`` is not written; it
      only changes the current label to the mapped value (``close_tags``
      wins if a token is in both);
    * a token equal to ``sentence_end`` is not written; it ends the
      current output line;
    * any other token is written as columns 2-4 joined with ``'|'``,
      followed by ``'|'``, the current label and a single space.

    The current label starts as ``'O'`` and is NOT reset at a sentence
    end, so an unclosed tag keeps labelling tokens of later sentences.

    Args:
        rows: iterable of text lines (e.g. an open file).
        out: object with a ``write(str)`` method that receives the output.
        open_tags: mapping from opening-tag token to label.
        close_tags: mapping from closing-tag token to label.
        sentence_end: token that marks the end of a sentence.
    """
    flag = 'O'
    for row in rows:
        # Drop the line terminator before splitting; otherwise a row with
        # four or fewer columns would carry '\n' into the written fields.
        fields = row.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        token = fields[1]
        if token in close_tags:
            flag = close_tags[token]
        elif token in open_tags:
            flag = open_tags[token]
        elif token == sentence_end:
            out.write('\n')
        else:
            out.write('|'.join(fields[1:4]) + '|' + flag + ' ')


if __name__ == '__main__':
    # Open the input first so that a missing input file does not truncate
    # an existing output file.
    with open(inpath, 'r') as input_file, open(outpath, 'w') as out:
        write_labeled_sentences(input_file, out, in_labels, out_labels)