Showing
4 changed files
with
808 additions
and
0 deletions
1 | +import stanza | ||
2 | +import argparse | ||
3 | +import re | ||
4 | +import os | ||
5 | +import pandas as pd | ||
6 | + | ||
7 | +# Objective | ||
8 | +# Sentences extraction from XML Soft files. | ||
9 | +# | ||
10 | +# Input parameters | ||
11 | +# --inputPath=PATH Path to XML Soft files | ||
12 | +# --outputPath=PATH Path to place output files | ||
13 | +# | ||
14 | +# Output | ||
15 | +# Files with sentences obtained from XML Soft files | ||
16 | +# | ||
17 | +# Examples | ||
18 | +# python extract-sentences-from-softfiles.py | ||
19 | +# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data | ||
20 | +# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences | ||
21 | +# | ||
22 | +# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences | ||
23 | + | ||
24 | +########################################## | ||
25 | +# MAIN PROGRAM # | ||
26 | +########################################## | ||
27 | + | ||
# Growth-condition (GC) tags that are kept and converted into token labels:
# culture medium, medium supplements, aeration, temperature,
# pH, agitation, growth phase, optical density, genetic background
TAGS = ['Gtype', 'Med', 'Phase', 'Supp',
        'Temp', 'OD', 'Anti', 'Agit',
        'Air', 'Vess', 'pH']
# Tags whose markup is stripped from the text without producing a label
DELETED_TAGS = ['Gversion', 'Substrain', 'Strain', 'Technique']

# Regex to check if line has a tag
REGEX_HAS_TAG = re.compile(r'<(' + '|'.join(TAGS + DELETED_TAGS) + r')>')
# Regex to delete tags
REGEX_DELETE_TAG = re.compile(r'</?(' + '|'.join(DELETED_TAGS) + r')>')
# Regex to substitute tags
REGEX_SUBS_INI_TAG = re.compile(r'<(?P<tag>(' + '|'.join(TAGS) + r'))>')
REGEX_SUBS_END_TAG = re.compile(r'</(?P<tag>(' + '|'.join(TAGS) + r'))>')
# Regex to tag GCs
REGEX_GC_INI_TAG = re.compile(r'INI_(?P<tag>(' + '|'.join(TAGS) + r'))')
REGEX_GC_END_TAG = re.compile(r'END_(?P<tag>(' + '|'.join(TAGS) + r'))')

# Testing file: only this file name is processed while walking the input path
TESTING_FILE = "GSE54899_family_retagged-05242019_validated.xml"

# Columns of the CSV file handed over for manual checking
OUTPUT_COLUMNS = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
                  'original_sentence', 'modified_sentence', 'transformed_sentence']


def _mark_gc_tags(field_text):
    """Strip ignored tags and rewrite kept tags as ' INI_<tag> ' / ' END_<tag> '."""
    text = REGEX_DELETE_TAG.sub("", field_text)
    text = REGEX_SUBS_INI_TAG.sub(r' INI_\g<tag> ', text)
    return REGEX_SUBS_END_TAG.sub(r' END_\g<tag> ', text)


def _label_words(words):
    """Return 'text|lemma|xpos|label' items for words, skipping INI_/END_ markers.

    A word gets the tag named by the most recent INI_ marker, or "O" when it
    is outside any marker pair. The label state starts at "O" on every call.
    """
    labelled = []
    gc_tag = "O"
    for word in words:
        opening = REGEX_GC_INI_TAG.match(word.text)
        if opening:
            gc_tag = opening.group("tag")
            continue
        if REGEX_GC_END_TAG.match(word.text):
            gc_tag = "O"
            continue
        labelled.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
    return labelled


def main():
    """Extract GC-tagged sentences from XML Soft files into a CSV file.

    Walks --inputPath, reads every file named TESTING_FILE, and for each
    'name = text' line containing a GC tag writes one row per stanza sentence
    to <outputPath>/geo_sentences_to_check.csv.
    """
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    # Both paths are required: they are concatenated and joined below.
    parser.add_argument("--inputPath", dest="inputPath", required=True,
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath", required=True,
                        help="Path for output files", metavar="PATH")

    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)

    # Number of GC-tagged lines seen per field_name (bangline);
    # collected only, not written to any output yet
    hash_field_name = {}

    # Rows for sentences from fields that contained at least one GC tag.
    # Collected in a list and turned into a DataFrame once at the end,
    # because DataFrame.append no longer exists in pandas 2.0.
    rows = []

    # Context carried from earlier lines of the file, e.g.
    # ^SERIES = GSE54899
    serie = ""
    # !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # ^SAMPLE = GSM1326335
    sample = ""

    for path, _dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != TESTING_FILE:
                continue
            print(" Reading file..." + str(file))
            # Join with the directory being walked so files in subdirectories open too
            with open(os.path.join(path, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # Split on the first " = " only, so the text may contain " = "
                    field_name, separator, field_text = line.partition(" = ")
                    if not separator:
                        continue
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif REGEX_HAS_TAG.search(line):  # Contains GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        doc = nlp(_mark_gc_tags(field_text))
                        for sentence in doc.sentences:
                            rows.append({'serie': serie,
                                         'serie_pubmed_id': serie_pubmed_id,
                                         'sample': sample,
                                         'field_name': field_name,
                                         'original_sentence': field_text,
                                         'modified_sentence': sentence.text,
                                         'transformed_sentence': " ".join(_label_words(sentence.words))})

    df_sentences_to_check = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    main()
1 | +import stanza | ||
2 | +import argparse | ||
3 | +import re | ||
4 | +import os | ||
5 | +import pandas as pd | ||
6 | + | ||
7 | +# Objective | ||
8 | +# Sentences extraction from XML Soft files. | ||
9 | +# | ||
10 | +# Input parameters | ||
11 | +# --inputPath=PATH Path to XML Soft files | ||
12 | +# --outputPath=PATH Path to place output files | ||
13 | +# | ||
14 | +# Output | ||
15 | +# Files with sentences obtained from XML Soft files | ||
16 | +# | ||
17 | +# Examples | ||
18 | +# python extract-sentences-from-softfiles.py | ||
19 | +# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data | ||
20 | +# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences | ||
21 | +# | ||
22 | +# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences | ||
23 | + | ||
24 | +########################################## | ||
25 | +# MAIN PROGRAM # | ||
26 | +########################################## | ||
27 | + | ||
# Growth-condition (GC) tags that are kept and converted into token labels:
# culture medium, medium supplements, aeration, temperature,
# pH, agitation, growth phase, optical density, genetic background
TAGS = ['Gtype', 'Med', 'Phase', 'Supp',
        'Temp', 'OD', 'Anti', 'Agit',
        'Air', 'Vess', 'pH']
# Tags whose markup is stripped from the text without producing a label
DELETED_TAGS = ['Gversion', 'Substrain', 'Strain', 'Technique']

# Regex to check if line has a tag
REGEX_HAS_TAG = re.compile(r'<(' + '|'.join(TAGS + DELETED_TAGS) + r')>')
# Regex to delete tags
REGEX_DELETE_TAG = re.compile(r'</?(' + '|'.join(DELETED_TAGS) + r')>')
# Regex to substitute tags
REGEX_SUBS_INI_TAG = re.compile(r'<(?P<tag>(' + '|'.join(TAGS) + r'))>')
REGEX_SUBS_END_TAG = re.compile(r'</(?P<tag>(' + '|'.join(TAGS) + r'))>')
# Regex to tag GCs
REGEX_GC_INI_TAG = re.compile(r'INI_(?P<tag>(' + '|'.join(TAGS) + r'))')
REGEX_GC_END_TAG = re.compile(r'END_(?P<tag>(' + '|'.join(TAGS) + r'))')

# Testing file: only this file name is processed while walking the input path
TESTING_FILE = "GSE54899_family_retagged-05242019_validated.xml"

# Columns of the CSV file handed over for manual checking
OUTPUT_COLUMNS = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
                  'original_sentence', 'modified_sentence', 'transformed_sentence']


def _mark_gc_tags(field_text):
    """Strip ignored tags and rewrite kept tags as ' INI_<tag> ' / ' END_<tag> '."""
    text = REGEX_DELETE_TAG.sub("", field_text)
    text = REGEX_SUBS_INI_TAG.sub(r' INI_\g<tag> ', text)
    return REGEX_SUBS_END_TAG.sub(r' END_\g<tag> ', text)


def _label_words(words):
    """Return 'text|lemma|xpos|label' items for words, skipping INI_/END_ markers.

    A word gets the tag named by the most recent INI_ marker, or "O" when it
    is outside any marker pair. The label state starts at "O" on every call.
    """
    labelled = []
    gc_tag = "O"
    for word in words:
        opening = REGEX_GC_INI_TAG.match(word.text)
        if opening:
            gc_tag = opening.group("tag")
            continue
        if REGEX_GC_END_TAG.match(word.text):
            gc_tag = "O"
            continue
        labelled.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
    return labelled


def main():
    """Extract GC-tagged sentences from XML Soft files into a CSV file.

    Walks --inputPath, reads every file named TESTING_FILE, and for each
    'name = text' line containing a GC tag writes one row per stanza sentence
    to <outputPath>/geo_sentences_to_check.csv.
    """
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    # Both paths are required: they are concatenated and joined below.
    parser.add_argument("--inputPath", dest="inputPath", required=True,
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath", required=True,
                        help="Path for output files", metavar="PATH")

    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)

    # Number of GC-tagged lines seen per field_name (bangline);
    # collected only, not written to any output yet
    hash_field_name = {}

    # Rows for sentences from fields that contained at least one GC tag.
    # Collected in a list and turned into a DataFrame once at the end,
    # because DataFrame.append no longer exists in pandas 2.0.
    rows = []

    # Context carried from earlier lines of the file, e.g.
    # ^SERIES = GSE54899
    serie = ""
    # !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # ^SAMPLE = GSM1326335
    sample = ""

    for path, _dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != TESTING_FILE:
                continue
            print(" Reading file..." + str(file))
            # Join with the directory being walked so files in subdirectories open too
            with open(os.path.join(path, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # Split on the first " = " only, so the text may contain " = "
                    field_name, separator, field_text = line.partition(" = ")
                    if not separator:
                        continue
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif REGEX_HAS_TAG.search(line):  # Contains GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        doc = nlp(_mark_gc_tags(field_text))
                        for sentence in doc.sentences:
                            rows.append({'serie': serie,
                                         'serie_pubmed_id': serie_pubmed_id,
                                         'sample': sample,
                                         'field_name': field_name,
                                         'original_sentence': field_text,
                                         'modified_sentence': sentence.text,
                                         'transformed_sentence': " ".join(_label_words(sentence.words))})

    df_sentences_to_check = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    main()
extraction-literature/input/README.md
0 → 100644
1 | +# Input article collection | ||
2 | +We used the list of PMIDs from the article collections delivered by the curators (Víctor, Soco, Paloma). | ||
3 | +DRIVE (https://docs.google.com/spreadsheets/d/1OayfQ7ODgnU4d5PQ3SUAmFX3Tc27PocCHZ6flPXwLKc/edit?usp=sharing) | ||
4 | +Asana (https://app.asana.com/0/1200927210854847/1203428992254399/f) | ||
5 | + | ||
6 | +# Download PDFs | ||
7 | +We used the [Pubmed-Batch-Download](https://github.com/billgreenwald/Pubmed-Batch-Download) tool to download the PDF files. | ||
8 | +## Installation | ||
9 | +```shell | ||
10 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ git clone https://github.com/billgreenwald/Pubmed-Batch-Download.git | ||
11 | +Cloning into 'Pubmed-Batch-Download'... | ||
12 | +remote: Enumerating objects: 202, done. | ||
13 | +remote: Counting objects: 100% (12/12), done. | ||
14 | +remote: Compressing objects: 100% (12/12), done. | ||
15 | +remote: Total 202 (delta 5), reused 0 (delta 0), pack-reused 190 | ||
16 | +Receiving objects: 100% (202/202), 31.23 MiB | 1.09 MiB/s, done. | ||
17 | +Resolving deltas: 100% (102/102), done. | ||
18 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ mv Pubmed-Batch-Download/ github-Pubmed-Batch-Download | ||
19 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ cd github-Pubmed-Batch-Download/ | ||
20 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ ls -l | ||
21 | +total 52 | ||
22 | +-rw-rw-r-- 1 cmendezc cmendezc 72 ene 5 11:31 example_pmf.tsv | ||
23 | +-rw-rw-r-- 1 cmendezc cmendezc 11430 ene 5 11:31 fetch_pdfs.py | ||
24 | +-rw-rw-r-- 1 cmendezc cmendezc 18711 ene 5 11:31 fetch_pdfs_toScript.ipynb | ||
25 | +-rw-rw-r-- 1 cmendezc cmendezc 551 ene 5 11:31 pubmed-batch-downloader-py3-windows.yml | ||
26 | +-rw-rw-r-- 1 cmendezc cmendezc 895 ene 5 11:31 pubmed-batch-downloader-py3.yml | ||
27 | +-rw-rw-r-- 1 cmendezc cmendezc 3667 ene 5 11:31 README.md | ||
28 | +drwxrwxr-x 2 cmendezc cmendezc 4096 ene 5 11:31 ruby_version | ||
29 | +-rw-rw-r-- 1 cmendezc cmendezc 0 ene 5 11:31 unfetched_pmids.tsv | ||
30 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda env create -f pubmed-batch-downloader-py3.yml | ||
31 | +``` | ||
32 | +## Testing | ||
33 | +Error! | ||
34 | +```shell | ||
35 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3 | ||
36 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv | ||
37 | +Traceback (most recent call last): | ||
38 | + File "fetch_pdfs.py", line 64, in <module> | ||
39 | + from bs4 import BeautifulSoup | ||
40 | +ModuleNotFoundError: No module named 'bs4' | ||
41 | +``` | ||
42 | +Fix 1: Install bs4 | ||
43 | +```shell | ||
44 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install bs4 | ||
45 | +Collecting bs4 | ||
46 | + Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz | ||
47 | +Collecting beautifulsoup4 (from bs4) | ||
48 | + Using cached https://files.pythonhosted.org/packages/9c/d8/909c4089dbe4ade9f9705f143c9f13f065049a9d5e7d34c828aefdd0a97c/beautifulsoup4-4.11.1-py3-none-any.whl | ||
49 | +Collecting soupsieve>1.2 (from beautifulsoup4->bs4) | ||
50 | + Using cached https://files.pythonhosted.org/packages/16/e3/4ad79882b92617e3a4a0df1960d6bce08edfb637737ac5c3f3ba29022e25/soupsieve-2.3.2.post1-py3-none-any.whl | ||
51 | +Building wheels for collected packages: bs4 | ||
52 | + Building wheel for bs4 (setup.py) ... done | ||
53 | + Stored in directory: /home/cmendezc/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472 | ||
54 | +Successfully built bs4 | ||
55 | +Installing collected packages: soupsieve, beautifulsoup4, bs4 | ||
56 | +Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1 | ||
57 | +``` | ||
58 | +Error! | ||
59 | +```shell | ||
60 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv | ||
61 | +Output directory of fetched_pdfs did not exist. Created the directory. | ||
62 | +Trying to fetch pmid 27547345 | ||
63 | +** fetching of reprint 27547345 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
64 | +Trying to fetch pmid 22610656 | ||
65 | +** fetching of reprint 22610656 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
66 | +Trying to fetch pmid 23858657 | ||
67 | +** fetching of reprint 23858657 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
68 | +Trying to fetch pmid 24998529 | ||
69 | +** fetching of reprint 24998529 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
70 | +Trying to fetch pmid 27859194 | ||
71 | +** fetching of reprint 27859194 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
72 | +Trying to fetch pmid 26991916 | ||
73 | +** fetching of reprint 26991916 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
74 | +Trying to fetch pmid 26742956 | ||
75 | +** fetching of reprint 26742956 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
76 | +Trying to fetch pmid 28388874 | ||
77 | +** fetching of reprint 28388874 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? | ||
78 | +``` | ||
79 | +Fix 2: Install lxml | ||
80 | +```shell | ||
81 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install lxml | ||
82 | +Collecting lxml | ||
83 | + Downloading https://files.pythonhosted.org/packages/4b/24/300d0fd5130cf55e5bbab2c53d339728370cb4ac12ca80a4f421c2e228eb/lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (5.8MB) | ||
84 | + |████████████████████████████████| 5.8MB 2.7MB/s | ||
85 | +Installing collected packages: lxml | ||
86 | +Successfully installed lxml-4.9.2 | ||
87 | +``` | ||
88 | +It runs, but it did not fetch all the files. See unfetched_pmids.tsv for the PMIDs that could not be downloaded. | ||
89 | + | ||
90 | +## Run | ||
91 | +```shell | ||
92 | +(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3 | ||
93 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ cd /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs | ||
94 | +(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs$ python /home/cmendezc/Documents/ccg/github-Pubmed-Batch-Download/fetch_pdfs.py -pmf ../list_of_PMIDs.txt | ||
95 | +Output directory of fetched_pdfs did not exist. Created the directory. | ||
96 | +Trying to fetch pmid 21097887 | ||
97 | +Trying genericCitationLabelled | ||
98 | +Trying pubmed_central_v2 | ||
99 | +** fetching reprint using the 'pubmed central' finder... | ||
100 | +** fetching of reprint 21097887 succeeded | ||
101 | +Trying to fetch pmid 23818864 | ||
102 | +Trying genericCitationLabelled | ||
103 | +Trying pubmed_central_v2 | ||
104 | +Trying acsPublications | ||
105 | +Trying uchicagoPress | ||
106 | +Trying nejm | ||
107 | +Trying futureMedicine | ||
108 | +Trying science_direct | ||
109 | +** fetching of reprint 23818864 failed from error list index out of range | ||
110 | +Trying to fetch pmid 24947454 | ||
111 | +Trying genericCitationLabelled | ||
112 | +** fetching reprint using the 'generic citation labelled' finder... | ||
113 | +** fetching of reprint 24947454 succeeded | ||
114 | +Trying to fetch pmid 25222563 | ||
115 | +Trying genericCitationLabelled | ||
116 | +** fetching reprint using the 'generic citation labelled' finder... | ||
117 | +** fetching of reprint 25222563 succeeded | ||
118 | +Trying to fetch pmid 25275371 | ||
119 | +Trying genericCitationLabelled | ||
120 | +** fetching reprint using the 'generic citation labelled' finder... | ||
121 | +** fetching of reprint 25275371 succeeded | ||
122 | +Trying to fetch pmid 25735747 | ||
123 | +Trying genericCitationLabelled | ||
124 | +Trying pubmed_central_v2 | ||
125 | +** fetching reprint using the 'pubmed central' finder... | ||
126 | +** fetching of reprint 25735747 succeeded | ||
127 | +Trying to fetch pmid 26258987 | ||
128 | +Trying genericCitationLabelled | ||
129 | +** fetching reprint using the 'generic citation labelled' finder... | ||
130 | +** fetching of reprint 26258987 succeeded | ||
131 | +Trying to fetch pmid 26279566 | ||
132 | +Trying genericCitationLabelled | ||
133 | +Trying pubmed_central_v2 | ||
134 | +Trying acsPublications | ||
135 | +Trying uchicagoPress | ||
136 | +Trying nejm | ||
137 | +Trying futureMedicine | ||
138 | +Trying science_direct | ||
139 | +Trying direct_pdf_link | ||
140 | +** Reprint 26279566 could not be fetched with the current finders. | ||
141 | +Trying to fetch pmid 26670385 | ||
142 | +Trying genericCitationLabelled | ||
143 | +Trying pubmed_central_v2 | ||
144 | +Trying acsPublications | ||
145 | +Trying uchicagoPress | ||
146 | +Trying nejm | ||
147 | +Trying futureMedicine | ||
148 | +Trying science_direct | ||
149 | +** fetching of reprint 26670385 failed from error Invalid URL 'f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ': No schema supplied. Perhaps you meant http://f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ? | ||
150 | +Trying to fetch pmid 26673755 | ||
151 | +Trying genericCitationLabelled | ||
152 | +** fetching reprint using the 'generic citation labelled' finder... | ||
153 | +** fetching of reprint 26673755 succeeded | ||
154 | +Trying to fetch pmid 28061857 | ||
155 | +Trying genericCitationLabelled | ||
156 | +** fetching reprint using the 'generic citation labelled' finder... | ||
157 | +** fetching of reprint 28061857 succeeded | ||
158 | +Trying to fetch pmid 28526842 | ||
159 | +Trying genericCitationLabelled | ||
160 | +** fetching reprint using the 'generic citation labelled' finder... | ||
161 | +** fetching of reprint 28526842 succeeded | ||
162 | +Trying to fetch pmid 29394395 | ||
163 | +Trying genericCitationLabelled | ||
164 | +Trying pubmed_central_v2 | ||
165 | +** fetching reprint using the 'pubmed central' finder... | ||
166 | +** fetching of reprint 29394395 succeeded | ||
167 | +Trying to fetch pmid 30137486 | ||
168 | +Trying genericCitationLabelled | ||
169 | +Trying pubmed_central_v2 | ||
170 | +** fetching reprint using the 'pubmed central' finder... | ||
171 | +** fetching of reprint 30137486 succeeded | ||
172 | +Trying to fetch pmid 30389436 | ||
173 | +Trying genericCitationLabelled | ||
174 | +Trying pubmed_central_v2 | ||
175 | +Trying acsPublications | ||
176 | +Trying uchicagoPress | ||
177 | +Trying nejm | ||
178 | +Trying futureMedicine | ||
179 | +Trying science_direct | ||
180 | +Trying direct_pdf_link | ||
181 | +** Reprint 30389436 could not be fetched with the current finders. | ||
182 | +Trying to fetch pmid 30420454 | ||
183 | +Trying genericCitationLabelled | ||
184 | +Trying pubmed_central_v2 | ||
185 | +Trying acsPublications | ||
186 | +Trying uchicagoPress | ||
187 | +Trying nejm | ||
188 | +Trying futureMedicine | ||
189 | +Trying science_direct | ||
190 | +** fetching of reprint 30420454 failed from error Invalid URL 'Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg': No schema supplied. Perhaps you meant http://Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg? | ||
191 | +Trying to fetch pmid 33172971 | ||
192 | +Trying genericCitationLabelled | ||
193 | +Trying pubmed_central_v2 | ||
194 | +Trying acsPublications | ||
195 | +Trying uchicagoPress | ||
196 | +Trying nejm | ||
197 | +Trying futureMedicine | ||
198 | +Trying science_direct | ||
199 | +** fetching of reprint 33172971 failed from error Invalid URL 'cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM': No schema supplied. Perhaps you meant http://cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM? | ||
200 | +Trying to fetch pmid 34428301 | ||
201 | +Trying genericCitationLabelled | ||
202 | +Trying pubmed_central_v2 | ||
203 | +** fetching reprint using the 'pubmed central' finder... | ||
204 | +** fetching of reprint 34428301 succeeded | ||
205 | +Trying to fetch pmid 34791440 | ||
206 | +Trying genericCitationLabelled | ||
207 | +Trying pubmed_central_v2 | ||
208 | +** fetching reprint using the 'pubmed central' finder... | ||
209 | +** fetching of reprint 34791440 succeeded | ||
210 | +Trying to fetch pmid 9140061 | ||
211 | +Trying genericCitationLabelled | ||
212 | +Trying pubmed_central_v2 | ||
213 | +Trying acsPublications | ||
214 | +Trying uchicagoPress | ||
215 | +Trying nejm | ||
216 | +Trying futureMedicine | ||
217 | +Trying science_direct | ||
218 | +** fetching of reprint 9140061 failed from error Invalid URL 'jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ': No schema supplied. Perhaps you meant http://jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ? | ||
219 | +Trying to fetch pmid 32662815 | ||
220 | +Trying genericCitationLabelled | ||
221 | +Trying pubmed_central_v2 | ||
222 | +** fetching reprint using the 'pubmed central' finder... | ||
223 | +** fetching of reprint 32662815 succeeded | ||
224 | +Trying to fetch pmid 32817380 | ||
225 | +Trying genericCitationLabelled | ||
226 | +Trying pubmed_central_v2 | ||
227 | +Trying acsPublications | ||
228 | +Trying uchicagoPress | ||
229 | +Trying nejm | ||
230 | +Trying futureMedicine | ||
231 | +Trying science_direct | ||
232 | +** fetching of reprint 32817380 failed from error Invalid URL '12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI': No schema supplied. Perhaps you meant http://12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI? | ||
233 | +Trying to fetch pmid 32849447 | ||
234 | +Trying genericCitationLabelled | ||
235 | +Trying pubmed_central_v2 | ||
236 | +** fetching reprint using the 'pubmed central' finder... | ||
237 | +** fetching of reprint 32849447 succeeded | ||
238 | +Trying to fetch pmid 33068046 | ||
239 | +Trying genericCitationLabelled | ||
240 | +Trying pubmed_central_v2 | ||
241 | +Trying acsPublications | ||
242 | +Trying uchicagoPress | ||
243 | +Trying nejm | ||
244 | +Trying futureMedicine | ||
245 | +Trying science_direct | ||
246 | +** fetching of reprint 33068046 failed from error Invalid URL 'Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ': No schema supplied. Perhaps you meant http://Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ? | ||
247 | +Trying to fetch pmid 33072717 | ||
248 | +Trying genericCitationLabelled | ||
249 | +Trying pubmed_central_v2 | ||
250 | +** fetching reprint using the 'pubmed central' finder... | ||
251 | +** fetching of reprint 33072717 succeeded | ||
252 | +Trying to fetch pmid 33136147 | ||
253 | +Trying genericCitationLabelled | ||
254 | +** fetching reprint using the 'generic citation labelled' finder... | ||
255 | +** fetching of reprint 33136147 succeeded | ||
256 | +Trying to fetch pmid 33318048 | ||
257 | +Trying genericCitationLabelled | ||
258 | +Trying pubmed_central_v2 | ||
259 | +Trying acsPublications | ||
260 | +Trying uchicagoPress | ||
261 | +Trying nejm | ||
262 | +Trying futureMedicine | ||
263 | +Trying science_direct | ||
264 | +** fetching of reprint 33318048 failed from error Invalid URL 'H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA': No schema supplied. Perhaps you meant http://H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA? | ||
265 | +``` | ||
266 | + | ||
267 | +# Text extraction from PDF | ||
268 | +We sent the PDF files to the Lisen&Curate team for text extraction. | |
269 | + |
1 | +21097887 | ||
2 | +23818864 | ||
3 | +24947454 | ||
4 | +25222563 | ||
5 | +25275371 | ||
6 | +25735747 | ||
7 | +26258987 | ||
8 | +26279566 | ||
9 | +26670385 | ||
10 | +26673755 | ||
11 | +28061857 | ||
12 | +28526842 | ||
13 | +29394395 | ||
14 | +30137486 | ||
15 | +30389436 | ||
16 | +30420454 | ||
17 | +33172971 | ||
18 | +34428301 | ||
19 | +34791440 | ||
20 | +9140061 | ||
21 | +32662815 | ||
22 | +32817380 | ||
23 | +32849447 | ||
24 | +33068046 | ||
25 | +33072717 | ||
26 | +33136147 | ||
27 | +33318048 |
-
Please register or login to post a comment