
# Gathering input document collection

import stanza
import argparse
import re
import os
import pandas as pd

# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")

    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs into consideration
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    #tags = ['<Gtype>', '<Med>', '<Phase>', '<Supp>',
    #        '<Temp>', '<OD>', '<Anti>', '<Agit>',
    #        '<Air>', '<Vess>', '<pH>']
    #deleted_tags = ['<Gversion>', '<Substrain>', '<Strain>', '<Technique>']
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if line has a tag
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete tags
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regex to substitute tags
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    #p = re.compile(r'blue (?P<animal>dog|cat)')
    #p.sub(r'gray \g<animal>', s)
    # Regex to tag GCs
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

    # Testing file: GSE54899_family_retagged-05242019_validated.xml
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"

    # Define stanza pipeline for sentence segmentation
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)

    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""

    # Store list of unique field_name
    hash_field_name = {}

    # Store sentences from fields that contained at least one GC tag.
    # We want to use this list for someone to check it
    df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])

    # Store serie number
    # ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id
    # !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample
    # ^SAMPLE = GSM1326335
    sample = ""

    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file == testing_file:
                print(" Reading file..." + str(file))
                with open(os.path.join(args.inputPath, file)) as iFile:
                    for line in iFile:
                        line = line.rstrip('\n')
                        if line.find(" = ") == -1:
                            continue
                        # Split only on the first " = " so field values containing " = " are kept whole
                        list_line = line.split(" = ", 1)
                        field_name = list_line[0]
                        #print("field_name: {}".format(field_name))
                        field_text = list_line[1]
                        #print("field_text: {}".format(field_text))
                        if field_name == "^SERIES":
                            serie = field_text
                        elif field_name == "!Series_pubmed_id":
                            serie_pubmed_id = field_text
                        elif field_name == "^SAMPLE":
                            sample = field_text
                        elif regex_has_tag.search(line):  # Contains GC tag
                            if field_name in hash_field_name:
                                hash_field_name[field_name] += 1
                            else:
                                hash_field_name[field_name] = 1
                            original_sentence = field_text
                            # delete tags we do not keep (regex_delete_tag matches both <X> and </X>)
                            modified_sentence = regex_delete_tag.sub("", field_text)
                            # substitute kept tags with INI_/END_ markers
                            # p = re.compile(r'blue (?P<animal>dog|cat)')
                            # p.sub(r'gray \g<animal>', s)
                            modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                            modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                            doc = nlp(modified_sentence)
                            for i, sentence in enumerate(doc.sentences):
                                # print(sentence.text)
                                list_transformed_sentence = []
                                # For GC tag
                                gc_tag = "O"
                                in_tag = False
                                for word in sentence.words:
                                    result = regex_gc_ini_tag.match(word.text)
                                    if result:
                                        gc_tag = result.group("tag")
                                        in_tag = True
                                        continue
                                    else:
                                        result = regex_gc_end_tag.match(word.text)
                                        if result:
                                            gc_tag = "O"
                                            in_tag = False
                                            continue
                                        else:
                                            if not in_tag:
                                                gc_tag = "O"
                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                                transformed_sentence = " ".join(list_transformed_sentence)
                                new_row = {'serie': serie,
                                           'serie_pubmed_id': serie_pubmed_id,
                                           'sample': sample,
                                           'field_name': field_name,
                                           'original_sentence': original_sentence,
                                           'modified_sentence': sentence.text,
                                           'transformed_sentence': transformed_sentence}
                                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
                                df_sentences_to_check = pd.concat([df_sentences_to_check, pd.DataFrame([new_row])], ignore_index=True)
                                df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    #print(token)
    # Stop here: everything below is legacy code (see the note below)
    quit()

    # NOTE: everything below appears to be legacy code from an earlier, CoreNLP-based
    # version of this script. It is unreachable (quit() above stops execution) and it
    # relies on names that are not defined here: in_labels, random, args.inputFile,
    # args.index, args.trainingFile and args.testFile.
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }

    # Other label
    flag = 'O'
    lista = []
    # First sentence
    sentence = ''
    n = 0
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        n = n + 1
                        words = sentence.split(' ')
                        # End of sentence
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # At least one true-tag on sentence
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build and save the tagged sentence
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')

    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))

    # Split 70 30 training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))

    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
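To make the transformation above concrete, here is a minimal, self-contained sketch (the sample field text is invented and stanza is left out, so the real output carries two extra fields, lemma and xpos, per token): tags that are not considered are deleted, kept tags become INI_/END_ markers, and every token between a pair of markers inherits that tag while all other tokens get 'O'.

```python
import re

# Same tag lists as in the script above
tags = ['Gtype', 'Med', 'Phase', 'Supp', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']

regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')

# Hypothetical tagged field text (not taken from a real GEO series)
field_text = "cells grown in <Med>LB</Med> at <Temp>37 C</Temp> until <Phase>mid-log</Phase> <Technique>RNA-seq</Technique>"

# 1) drop the tags we do not keep, 2) rewrite kept tags as INI_/END_ markers
modified = regex_delete_tag.sub("", field_text)
modified = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified)
modified = regex_subs_end_tag.sub(r' END_\g<tag> ', modified)

# Mimic the per-word loop with whitespace tokens instead of stanza words
# (the real script emits word|lemma|xpos|tag; lemma and xpos are omitted here)
gc_tag, out = "O", []
for tok in modified.split():
    if tok.startswith("INI_"):
        gc_tag = tok[len("INI_"):]
        continue
    if tok.startswith("END_"):
        gc_tag = "O"
        continue
    out.append("{}|{}".format(tok, gc_tag))

print(" ".join(out))
# cells|O grown|O in|O LB|Med at|O 37|Temp C|Temp until|O mid-log|Phase RNA-seq|O
```

The real script does the same on stanza words and also records the serie, sample and field name of every sentence in geo_sentences_to_check.csv.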
# Input article collection
We used lists of PMIDs from article collections delivered by curators (Víctor, Soco, Paloma):
- DRIVE (https://docs.google.com/spreadsheets/d/1OayfQ7ODgnU4d5PQ3SUAmFX3Tc27PocCHZ6flPXwLKc/edit?usp=sharing)
- Asana (https://app.asana.com/0/1200927210854847/1203428992254399/f)

# Download PDFs
We used the [Pubmed-Batch-Download](https://github.com/billgreenwald/Pubmed-Batch-Download) tool to download PDF files.
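The `-pmf` file we pass to `fetch_pdfs.py` (`list_of_PMIDs.txt`, reproduced at the end of this page) is simply one PMID per line. A minimal sketch, assuming the curated PMIDs are already in a Python list, for writing that file (the three PMIDs are just examples from our list):

```python
# Write the one-PMID-per-line file that fetch_pdfs.py receives via -pmf.
# The PMIDs below are just examples taken from our list_of_PMIDs.txt.
pmids = ["21097887", "24947454", "25222563"]

with open("list_of_PMIDs.txt", "w") as handle:
    handle.write("\n".join(pmids) + "\n")
```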
## Installation
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ git clone https://github.com/billgreenwald/Pubmed-Batch-Download.git
Cloning into 'Pubmed-Batch-Download'...
remote: Enumerating objects: 202, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 202 (delta 5), reused 0 (delta 0), pack-reused 190
Receiving objects: 100% (202/202), 31.23 MiB | 1.09 MiB/s, done.
Resolving deltas: 100% (102/102), done.
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ mv Pubmed-Batch-Download/ github-Pubmed-Batch-Download
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ cd github-Pubmed-Batch-Download/
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ ls -l
total 52
-rw-rw-r-- 1 cmendezc cmendezc 72 ene 5 11:31 example_pmf.tsv
-rw-rw-r-- 1 cmendezc cmendezc 11430 ene 5 11:31 fetch_pdfs.py
-rw-rw-r-- 1 cmendezc cmendezc 18711 ene 5 11:31 fetch_pdfs_toScript.ipynb
-rw-rw-r-- 1 cmendezc cmendezc 551 ene 5 11:31 pubmed-batch-downloader-py3-windows.yml
-rw-rw-r-- 1 cmendezc cmendezc 895 ene 5 11:31 pubmed-batch-downloader-py3.yml
-rw-rw-r-- 1 cmendezc cmendezc 3667 ene 5 11:31 README.md
drwxrwxr-x 2 cmendezc cmendezc 4096 ene 5 11:31 ruby_version
-rw-rw-r-- 1 cmendezc cmendezc 0 ene 5 11:31 unfetched_pmids.tsv
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda env create -f pubmed-batch-downloader-py3.yml
```
## Testing
Error!
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Traceback (most recent call last):
  File "fetch_pdfs.py", line 64, in <module>
    from bs4 import BeautifulSoup
ModuleNotFoundError: No module named 'bs4'
```
Fix 1: Install bs4
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install bs4
Collecting bs4
  Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
  Using cached https://files.pythonhosted.org/packages/9c/d8/909c4089dbe4ade9f9705f143c9f13f065049a9d5e7d34c828aefdd0a97c/beautifulsoup4-4.11.1-py3-none-any.whl
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached https://files.pythonhosted.org/packages/16/e3/4ad79882b92617e3a4a0df1960d6bce08edfb637737ac5c3f3ba29022e25/soupsieve-2.3.2.post1-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... done
  Stored in directory: /home/cmendezc/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1
```
Error!
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 27547345
** fetching of reprint 27547345 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 22610656
** fetching of reprint 22610656 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 23858657
** fetching of reprint 23858657 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 24998529
** fetching of reprint 24998529 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 27859194
** fetching of reprint 27859194 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26991916
** fetching of reprint 26991916 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26742956
** fetching of reprint 26742956 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 28388874
** fetching of reprint 28388874 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
```
Fix 2: Install lxml (the parser BeautifulSoup asks for)
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install lxml
Collecting lxml
  Downloading https://files.pythonhosted.org/packages/4b/24/300d0fd5130cf55e5bbab2c53d339728370cb4ac12ca80a4f421c2e228eb/lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (5.8MB)
     |████████████████████████████████| 5.8MB 2.7MB/s
Installing collected packages: lxml
Successfully installed lxml-4.9.2
```
It runs, but it did not fetch all the files; see unfetched_pmids.tsv.

## Run
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ cd /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs$ python /home/cmendezc/Documents/ccg/github-Pubmed-Batch-Download/fetch_pdfs.py -pmf ../list_of_PMIDs.txt
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 21097887
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 21097887 succeeded
Trying to fetch pmid 23818864
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 23818864 failed from error list index out of range
Trying to fetch pmid 24947454
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 24947454 succeeded
Trying to fetch pmid 25222563
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25222563 succeeded
Trying to fetch pmid 25275371
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25275371 succeeded
Trying to fetch pmid 25735747
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 25735747 succeeded
Trying to fetch pmid 26258987
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26258987 succeeded
Trying to fetch pmid 26279566
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 26279566 could not be fetched with the current finders.
Trying to fetch pmid 26670385
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 26670385 failed from error Invalid URL 'f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ': No schema supplied. Perhaps you meant http://f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ?
Trying to fetch pmid 26673755
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26673755 succeeded
Trying to fetch pmid 28061857
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28061857 succeeded
Trying to fetch pmid 28526842
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28526842 succeeded
Trying to fetch pmid 29394395
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 29394395 succeeded
Trying to fetch pmid 30137486
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 30137486 succeeded
Trying to fetch pmid 30389436
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 30389436 could not be fetched with the current finders.
Trying to fetch pmid 30420454
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 30420454 failed from error Invalid URL 'Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg': No schema supplied. Perhaps you meant http://Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg?
Trying to fetch pmid 33172971
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33172971 failed from error Invalid URL 'cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM': No schema supplied. Perhaps you meant http://cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM?
Trying to fetch pmid 34428301
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34428301 succeeded
Trying to fetch pmid 34791440
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34791440 succeeded
Trying to fetch pmid 9140061
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 9140061 failed from error Invalid URL 'jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ': No schema supplied. Perhaps you meant http://jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ?
Trying to fetch pmid 32662815
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32662815 succeeded
Trying to fetch pmid 32817380
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 32817380 failed from error Invalid URL '12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI': No schema supplied. Perhaps you meant http://12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI?
Trying to fetch pmid 32849447
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32849447 succeeded
Trying to fetch pmid 33068046
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33068046 failed from error Invalid URL 'Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ': No schema supplied. Perhaps you meant http://Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ?
Trying to fetch pmid 33072717
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 33072717 succeeded
Trying to fetch pmid 33136147
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 33136147 succeeded
Trying to fetch pmid 33318048
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33318048 failed from error Invalid URL 'H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA': No schema supplied. Perhaps you meant http://H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA?
```
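
Not every PMID could be fetched automatically (see the failures above). A small sketch to list which PMIDs from `list_of_PMIDs.txt` still lack a PDF, assuming `fetch_pdfs.py` names each downloaded file `<pmid>.pdf` inside `fetched_pdfs/` (an assumption; adjust the check if the actual file names differ):

```python
import os

# Hypothetical helper: report PMIDs from list_of_PMIDs.txt that have no PDF yet.
# Assumes each downloaded file is named <pmid>.pdf inside fetched_pdfs/.
pmid_file = "list_of_PMIDs.txt"
pdf_dir = "fetched_pdfs"

with open(pmid_file) as handle:
    pmids = [line.strip() for line in handle if line.strip()]

fetched = set(os.listdir(pdf_dir)) if os.path.isdir(pdf_dir) else set()
missing = [p for p in pmids if "{}.pdf".format(p) not in fetched]

print("{} of {} PDFs present; missing: {}".format(len(pmids) - len(missing), len(pmids), missing))
```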

# Text extraction from PDF
We sent the PDF files to the Lisen&Curate team for text extraction.

List of PMIDs used for the download above (list_of_PMIDs.txt):

21097887
23818864
24947454
25222563
25275371
25735747
26258987
26279566
26670385
26673755
28061857
28526842
29394395
30137486
30389436
30420454
33172971
34428301
34791440
9140061
32662815
32817380
32849447
33068046
33072717
33136147
33318048