Author: cmendezc

# Gathering the input document collection

import argparse
import os
import random
import re
import sys

import pandas as pd
import stanza
# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # -------------------------------------------------------------------------
    # Command-line parameters
    # -------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs (growth conditions) under consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background.
    # NOTE(review): this dict is immediately shadowed by the list assignment
    # below; it is kept only as a record of the original XML tag spellings.
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    # Tags that are kept: their markers are rewritten to INI_/END_ tokens.
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tags that are stripped from the text altogether.
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains any GC tag at all.
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening and closing markers of discarded tags.
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to substitute <Tag>...</Tag> with " INI_Tag ... END_Tag ".
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize the INI_/END_ marker tokens after tokenization.
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Single file used while testing the pipeline; all other files are skipped.
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Stanza pipeline for sentence segmentation.
    # NOTE(review): defined but not used in this block — candidate for removal.
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Stanza pipeline for lemmatization and POS tagging (no sentence split).
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',
                          tokenize_no_ssplit=True)
    # Current field_name (bangline) and field_text of the SOFT line.
    field_name = ""
    field_text = ""
    # Occurrence count per field_name that contained at least one GC tag.
    hash_field_name = {}
    # Sentences from fields that contained at least one GC tag, collected as
    # plain dicts and turned into a DataFrame once at the end.
    # (Per-row DataFrame.append was removed in pandas 2.0 and was O(n^2).)
    columns = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
               'original_sentence', 'modified_sentence', 'transformed_sentence']
    rows_to_check = []
    # Serie number, e.g. "^SERIES = GSE54899".
    serie = ""
    # Series pubmed id, e.g. "!Series_pubmed_id = 25222563".
    serie_pubmed_id = ""
    # Sample, e.g. "^SAMPLE = GSM1326335".
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != testing_file:
                continue
            print(" Reading file..." + str(file))
            with open(os.path.join(args.inputPath, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # SOFT format lines look like "<field> = <text>".
                    if line.find(" = ") == -1:
                        continue
                    list_line = line.split(" = ")
                    field_name = list_line[0]
                    field_text = list_line[1]
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # contains a GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        original_sentence = field_text
                        # Delete discarded GC tags (single sub handles both the
                        # opening and closing markers; the original called sub
                        # twice, which was redundant).
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Rewrite kept tags to INI_/END_ marker tokens.
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            gc_tag = "O"    # current GC label; "O" = outside
                            in_tag = False  # inside an INI_... END_... span?
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # token|lemma|POS|GC-label
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    # Write the collected sentences for manual checking.
    df_sentences_to_check = pd.DataFrame(rows_to_check, columns=columns)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    # Debugging stop (was quit()): everything below never executes.
    sys.exit()

    # -------------------------------------------------------------------------
    # Legacy code: unreachable (after sys.exit()) and it references names that
    # are never defined in this file (in_labels, args.inputFile, args.index,
    # args.trainingFile, args.testFile) — kept verbatim for reference only.
    # -------------------------------------------------------------------------
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }
    flag = 'O'      # current label; 'O' = other
    lista = []      # sentences containing at least one true tag
    sentence = ''   # sentence under construction
    n = 0           # number of sentences seen
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        n = n + 1
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build the tagged sentence token by token
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))
    # Split 70/30 into training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))
    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))
    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))
    print("==================================END===================================")
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # NOTE(review): this entire guarded block is a byte-for-byte duplicate of
    # the one earlier in the file; the duplication should eventually be removed.
    # Command-line parameters.
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs (growth conditions) under consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background.
    # NOTE(review): this dict is immediately shadowed by the list assignment
    # below; it is kept only as a record of the original XML tag spellings.
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    # Tags that are kept: their markers are rewritten to INI_/END_ tokens.
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tags that are stripped from the text altogether.
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains any GC tag at all.
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening and closing markers of discarded tags.
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to substitute <Tag>...</Tag> with " INI_Tag ... END_Tag ".
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize the INI_/END_ marker tokens after tokenization.
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Single file used while testing the pipeline; all other files are skipped.
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Stanza pipeline for sentence segmentation.
    # NOTE(review): defined but not used in this block — candidate for removal.
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Stanza pipeline for lemmatization and POS tagging (no sentence split).
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',
                          tokenize_no_ssplit=True)
    # Current field_name (bangline) and field_text of the SOFT line.
    field_name = ""
    field_text = ""
    # Occurrence count per field_name that contained at least one GC tag.
    hash_field_name = {}
    # Sentences from fields that contained at least one GC tag, collected as
    # plain dicts and turned into a DataFrame once at the end.
    # (Per-row DataFrame.append was removed in pandas 2.0 and was O(n^2).)
    columns = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
               'original_sentence', 'modified_sentence', 'transformed_sentence']
    rows_to_check = []
    # Serie number, e.g. "^SERIES = GSE54899".
    serie = ""
    # Series pubmed id, e.g. "!Series_pubmed_id = 25222563".
    serie_pubmed_id = ""
    # Sample, e.g. "^SAMPLE = GSM1326335".
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != testing_file:
                continue
            print(" Reading file..." + str(file))
            with open(os.path.join(args.inputPath, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # SOFT format lines look like "<field> = <text>".
                    if line.find(" = ") == -1:
                        continue
                    list_line = line.split(" = ")
                    field_name = list_line[0]
                    field_text = list_line[1]
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # contains a GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        original_sentence = field_text
                        # Delete discarded GC tags (single sub handles both the
                        # opening and closing markers; the original called sub
                        # twice, which was redundant).
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Rewrite kept tags to INI_/END_ marker tokens.
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            gc_tag = "O"    # current GC label; "O" = outside
                            in_tag = False  # inside an INI_... END_... span?
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # token|lemma|POS|GC-label
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    # Write the collected sentences for manual checking.
    df_sentences_to_check = pd.DataFrame(rows_to_check, columns=columns)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    # Debugging stop (was quit()): everything below never executes.
    sys.exit()

    # -------------------------------------------------------------------------
    # Legacy code: unreachable (after sys.exit()) and it references names that
    # are never defined in this file (in_labels, args.inputFile, args.index,
    # args.trainingFile, args.testFile) — kept verbatim for reference only.
    # -------------------------------------------------------------------------
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }
    flag = 'O'      # current label; 'O' = other
    lista = []      # sentences containing at least one true tag
    sentence = ''   # sentence under construction
    n = 0           # number of sentences seen
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        n = n + 1
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build the tagged sentence token by token
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))
    # Split 70/30 into training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))
    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))
    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))
    print("==================================END===================================")
# Input article collection
We used a list of PMIDs from the article collections delivered by the curators (Víctor, Soco, Paloma).
DRIVE (https://docs.google.com/spreadsheets/d/1OayfQ7ODgnU4d5PQ3SUAmFX3Tc27PocCHZ6flPXwLKc/edit?usp=sharing)
Asana (https://app.asana.com/0/1200927210854847/1203428992254399/f)
# Download PDFs
We used [Pubmed-Batch-Download](https://github.com/billgreenwald/Pubmed-Batch-Download) tool to download PDF files.
## Installation
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ git clone https://github.com/billgreenwald/Pubmed-Batch-Download.git
Cloning into 'Pubmed-Batch-Download'...
remote: Enumerating objects: 202, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 202 (delta 5), reused 0 (delta 0), pack-reused 190
Receiving objects: 100% (202/202), 31.23 MiB | 1.09 MiB/s, done.
Resolving deltas: 100% (102/102), done.
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ mv Pubmed-Batch-Download/ github-Pubmed-Batch-Download
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ cd github-Pubmed-Batch-Download/
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ ls -l
total 52
-rw-rw-r-- 1 cmendezc cmendezc 72 ene 5 11:31 example_pmf.tsv
-rw-rw-r-- 1 cmendezc cmendezc 11430 ene 5 11:31 fetch_pdfs.py
-rw-rw-r-- 1 cmendezc cmendezc 18711 ene 5 11:31 fetch_pdfs_toScript.ipynb
-rw-rw-r-- 1 cmendezc cmendezc 551 ene 5 11:31 pubmed-batch-downloader-py3-windows.yml
-rw-rw-r-- 1 cmendezc cmendezc 895 ene 5 11:31 pubmed-batch-downloader-py3.yml
-rw-rw-r-- 1 cmendezc cmendezc 3667 ene 5 11:31 README.md
drwxrwxr-x 2 cmendezc cmendezc 4096 ene 5 11:31 ruby_version
-rw-rw-r-- 1 cmendezc cmendezc 0 ene 5 11:31 unfetched_pmids.tsv
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda env create -f pubmed-batch-downloader-py3.yml
```
## Testing
Error!
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Traceback (most recent call last):
File "fetch_pdfs.py", line 64, in <module>
from bs4 import BeautifulSoup
ModuleNotFoundError: No module named 'bs4'
```
Fix 1: Install bs4
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install bs4
Collecting bs4
Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
Using cached https://files.pythonhosted.org/packages/9c/d8/909c4089dbe4ade9f9705f143c9f13f065049a9d5e7d34c828aefdd0a97c/beautifulsoup4-4.11.1-py3-none-any.whl
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
Using cached https://files.pythonhosted.org/packages/16/e3/4ad79882b92617e3a4a0df1960d6bce08edfb637737ac5c3f3ba29022e25/soupsieve-2.3.2.post1-py3-none-any.whl
Building wheels for collected packages: bs4
Building wheel for bs4 (setup.py) ... done
Stored in directory: /home/cmendezc/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1
```
Error!
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 27547345
** fetching of reprint 27547345 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 22610656
** fetching of reprint 22610656 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 23858657
** fetching of reprint 23858657 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 24998529
** fetching of reprint 24998529 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 27859194
** fetching of reprint 27859194 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26991916
** fetching of reprint 26991916 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26742956
** fetching of reprint 26742956 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 28388874
** fetching of reprint 28388874 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
```
Fix 2: Install lxml, the parser library that BeautifulSoup failed to find
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install lxml
Collecting lxml
Downloading https://files.pythonhosted.org/packages/4b/24/300d0fd5130cf55e5bbab2c53d339728370cb4ac12ca80a4f421c2e228eb/lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (5.8MB)
|████████████████████████████████| 5.8MB 2.7MB/s
Installing collected packages: lxml
Successfully installed lxml-4.9.2
```
It runs, but it did not fetch all files. See unfetched_pmids.tsv for the PMIDs that could not be downloaded.
## Run
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ cd /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs$ python /home/cmendezc/Documents/ccg/github-Pubmed-Batch-Download/fetch_pdfs.py -pmf ../list_of_PMIDs.txt
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 21097887
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 21097887 succeeded
Trying to fetch pmid 23818864
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 23818864 failed from error list index out of range
Trying to fetch pmid 24947454
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 24947454 succeeded
Trying to fetch pmid 25222563
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25222563 succeeded
Trying to fetch pmid 25275371
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25275371 succeeded
Trying to fetch pmid 25735747
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 25735747 succeeded
Trying to fetch pmid 26258987
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26258987 succeeded
Trying to fetch pmid 26279566
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 26279566 could not be fetched with the current finders.
Trying to fetch pmid 26670385
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 26670385 failed from error Invalid URL 'f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ': No schema supplied. Perhaps you meant http://f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ?
Trying to fetch pmid 26673755
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26673755 succeeded
Trying to fetch pmid 28061857
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28061857 succeeded
Trying to fetch pmid 28526842
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28526842 succeeded
Trying to fetch pmid 29394395
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 29394395 succeeded
Trying to fetch pmid 30137486
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 30137486 succeeded
Trying to fetch pmid 30389436
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 30389436 could not be fetched with the current finders.
Trying to fetch pmid 30420454
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 30420454 failed from error Invalid URL 'Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg': No schema supplied. Perhaps you meant http://Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg?
Trying to fetch pmid 33172971
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33172971 failed from error Invalid URL 'cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM': No schema supplied. Perhaps you meant http://cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM?
Trying to fetch pmid 34428301
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34428301 succeeded
Trying to fetch pmid 34791440
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34791440 succeeded
Trying to fetch pmid 9140061
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 9140061 failed from error Invalid URL 'jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ': No schema supplied. Perhaps you meant http://jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ?
Trying to fetch pmid 32662815
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32662815 succeeded
Trying to fetch pmid 32817380
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 32817380 failed from error Invalid URL '12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI': No schema supplied. Perhaps you meant http://12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI?
Trying to fetch pmid 32849447
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32849447 succeeded
Trying to fetch pmid 33068046
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33068046 failed from error Invalid URL 'Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ': No schema supplied. Perhaps you meant http://Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ?
Trying to fetch pmid 33072717
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 33072717 succeeded
Trying to fetch pmid 33136147
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 33136147 succeeded
Trying to fetch pmid 33318048
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33318048 failed from error Invalid URL 'H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA': No schema supplied. Perhaps you meant http://H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA?
```
# Text extraction from PDF
We sent the PDF files to the Lisen&Curate team for text extraction.
21097887
23818864
24947454
25222563
25275371
25735747
26258987
26279566
26670385
26673755
28061857
28526842
29394395
30137486
30389436
30420454
33172971
34428301
34791440
9140061
32662815
32817380
32849447
33068046
33072717
33136147
33318048