Author: cmendezc

# Gathering the input document collection

import argparse
import os
import random
import re
import sys

import pandas as pd
import stanza
# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # -------------------------------------------------------------------------
    # Command-line parameters
    # -------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs (growth conditions) under consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background.
    # NOTE(review): this dict is immediately shadowed by the list assignment
    # below; it is kept only as a record of the original XML tag spellings.
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    # Tags that are kept: their markers are rewritten to INI_/END_ tokens.
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tags that are stripped from the text altogether.
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains any GC tag at all.
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening and closing markers of discarded tags.
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to substitute <Tag>...</Tag> with " INI_Tag ... END_Tag ".
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize the INI_/END_ marker tokens after tokenization.
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Single file used while testing the pipeline; all other files are skipped.
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Stanza pipeline for sentence segmentation.
    # NOTE(review): defined but not used in this block — candidate for removal.
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Stanza pipeline for lemmatization and POS tagging (no sentence split).
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',
                          tokenize_no_ssplit=True)
    # Current field_name (bangline) and field_text of the SOFT line.
    field_name = ""
    field_text = ""
    # Occurrence count per field_name that contained at least one GC tag.
    hash_field_name = {}
    # Sentences from fields that contained at least one GC tag, collected as
    # plain dicts and turned into a DataFrame once at the end.
    # (Per-row DataFrame.append was removed in pandas 2.0 and was O(n^2).)
    columns = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
               'original_sentence', 'modified_sentence', 'transformed_sentence']
    rows_to_check = []
    # Serie number, e.g. "^SERIES = GSE54899".
    serie = ""
    # Series pubmed id, e.g. "!Series_pubmed_id = 25222563".
    serie_pubmed_id = ""
    # Sample, e.g. "^SAMPLE = GSM1326335".
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != testing_file:
                continue
            print(" Reading file..." + str(file))
            with open(os.path.join(args.inputPath, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # SOFT format lines look like "<field> = <text>".
                    if line.find(" = ") == -1:
                        continue
                    list_line = line.split(" = ")
                    field_name = list_line[0]
                    field_text = list_line[1]
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # contains a GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        original_sentence = field_text
                        # Delete discarded GC tags (single sub handles both the
                        # opening and closing markers; the original called sub
                        # twice, which was redundant).
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Rewrite kept tags to INI_/END_ marker tokens.
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            gc_tag = "O"    # current GC label; "O" = outside
                            in_tag = False  # inside an INI_... END_... span?
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # token|lemma|POS|GC-label
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    # Write the collected sentences for manual checking.
    df_sentences_to_check = pd.DataFrame(rows_to_check, columns=columns)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    # Debugging stop (was quit()): everything below never executes.
    sys.exit()

    # -------------------------------------------------------------------------
    # Legacy code: unreachable (after sys.exit()) and it references names that
    # are never defined in this file (in_labels, args.inputFile, args.index,
    # args.trainingFile, args.testFile) — kept verbatim for reference only.
    # -------------------------------------------------------------------------
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }
    flag = 'O'      # current label; 'O' = other
    lista = []      # sentences containing at least one true tag
    sentence = ''   # sentence under construction
    n = 0           # number of sentences seen
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        n = n + 1
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build the tagged sentence token by token
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))
    # Split 70/30 into training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))
    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))
    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))
    print("==================================END===================================")
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # NOTE(review): this entire guarded block is a byte-for-byte duplicate of
    # the one earlier in the file; the duplication should eventually be removed.
    # Command-line parameters.
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs (growth conditions) under consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background.
    # NOTE(review): this dict is immediately shadowed by the list assignment
    # below; it is kept only as a record of the original XML tag spellings.
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    # Tags that are kept: their markers are rewritten to INI_/END_ tokens.
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tags that are stripped from the text altogether.
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains any GC tag at all.
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening and closing markers of discarded tags.
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to substitute <Tag>...</Tag> with " INI_Tag ... END_Tag ".
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize the INI_/END_ marker tokens after tokenization.
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Single file used while testing the pipeline; all other files are skipped.
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Stanza pipeline for sentence segmentation.
    # NOTE(review): defined but not used in this block — candidate for removal.
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Stanza pipeline for lemmatization and POS tagging (no sentence split).
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma',
                          tokenize_no_ssplit=True)
    # Current field_name (bangline) and field_text of the SOFT line.
    field_name = ""
    field_text = ""
    # Occurrence count per field_name that contained at least one GC tag.
    hash_field_name = {}
    # Sentences from fields that contained at least one GC tag, collected as
    # plain dicts and turned into a DataFrame once at the end.
    # (Per-row DataFrame.append was removed in pandas 2.0 and was O(n^2).)
    columns = ['serie', 'serie_pubmed_id', 'sample', 'field_name',
               'original_sentence', 'modified_sentence', 'transformed_sentence']
    rows_to_check = []
    # Serie number, e.g. "^SERIES = GSE54899".
    serie = ""
    # Series pubmed id, e.g. "!Series_pubmed_id = 25222563".
    serie_pubmed_id = ""
    # Sample, e.g. "^SAMPLE = GSM1326335".
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file != testing_file:
                continue
            print(" Reading file..." + str(file))
            with open(os.path.join(args.inputPath, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    # SOFT format lines look like "<field> = <text>".
                    if line.find(" = ") == -1:
                        continue
                    list_line = line.split(" = ")
                    field_name = list_line[0]
                    field_text = list_line[1]
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # contains a GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        original_sentence = field_text
                        # Delete discarded GC tags (single sub handles both the
                        # opening and closing markers; the original called sub
                        # twice, which was redundant).
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Rewrite kept tags to INI_/END_ marker tokens.
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            gc_tag = "O"    # current GC label; "O" = outside
                            in_tag = False  # inside an INI_... END_... span?
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # token|lemma|POS|GC-label
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    # Write the collected sentences for manual checking.
    df_sentences_to_check = pd.DataFrame(rows_to_check, columns=columns)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    # Debugging stop (was quit()): everything below never executes.
    sys.exit()

    # -------------------------------------------------------------------------
    # Legacy code: unreachable (after sys.exit()) and it references names that
    # are never defined in this file (in_labels, args.inputFile, args.index,
    # args.trainingFile, args.testFile) — kept verbatim for reference only.
    # -------------------------------------------------------------------------
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }
    flag = 'O'      # current label; 'O' = other
    lista = []      # sentences containing at least one true tag
    sentence = ''   # sentence under construction
    n = 0           # number of sentences seen
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        n = n + 1
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build the tagged sentence token by token
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))
    # Split 70/30 into training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))
    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))
    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))
    print("==================================END===================================")
# Input article collection
We used a list of PMIDs from the article collections delivered by the curators (Víctor, Soco, Paloma).
DRIVE (https://docs.google.com/spreadsheets/d/1OayfQ7ODgnU4d5PQ3SUAmFX3Tc27PocCHZ6flPXwLKc/edit?usp=sharing)
Asana (https://app.asana.com/0/1200927210854847/1203428992254399/f)
# Download PDFs
We used [Pubmed-Batch-Download](https://github.com/billgreenwald/Pubmed-Batch-Download) tool to download PDF files.
## Installation
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ git clone https://github.com/billgreenwald/Pubmed-Batch-Download.git
Cloning into 'Pubmed-Batch-Download'...
remote: Enumerating objects: 202, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 202 (delta 5), reused 0 (delta 0), pack-reused 190
Receiving objects: 100% (202/202), 31.23 MiB | 1.09 MiB/s, done.
Resolving deltas: 100% (102/102), done.
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ mv Pubmed-Batch-Download/ github-Pubmed-Batch-Download
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ cd github-Pubmed-Batch-Download/
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ ls -l
total 52
-rw-rw-r-- 1 cmendezc cmendezc 72 ene 5 11:31 example_pmf.tsv
-rw-rw-r-- 1 cmendezc cmendezc 11430 ene 5 11:31 fetch_pdfs.py
-rw-rw-r-- 1 cmendezc cmendezc 18711 ene 5 11:31 fetch_pdfs_toScript.ipynb
-rw-rw-r-- 1 cmendezc cmendezc 551 ene 5 11:31 pubmed-batch-downloader-py3-windows.yml
-rw-rw-r-- 1 cmendezc cmendezc 895 ene 5 11:31 pubmed-batch-downloader-py3.yml
-rw-rw-r-- 1 cmendezc cmendezc 3667 ene 5 11:31 README.md
drwxrwxr-x 2 cmendezc cmendezc 4096 ene 5 11:31 ruby_version
-rw-rw-r-- 1 cmendezc cmendezc 0 ene 5 11:31 unfetched_pmids.tsv
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda env create -f pubmed-batch-downloader-py3.yml
```
## Testing
Error!
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Traceback (most recent call last):
File "fetch_pdfs.py", line 64, in <module>
from bs4 import BeautifulSoup
ModuleNotFoundError: No module named 'bs4'
```
Fix 1: Install bs4
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install bs4
Collecting bs4
Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
Using cached https://files.pythonhosted.org/packages/9c/d8/909c4089dbe4ade9f9705f143c9f13f065049a9d5e7d34c828aefdd0a97c/beautifulsoup4-4.11.1-py3-none-any.whl
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
Using cached https://files.pythonhosted.org/packages/16/e3/4ad79882b92617e3a4a0df1960d6bce08edfb637737ac5c3f3ba29022e25/soupsieve-2.3.2.post1-py3-none-any.whl
Building wheels for collected packages: bs4
Building wheel for bs4 (setup.py) ... done
Stored in directory: /home/cmendezc/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1
```
Error!
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 27547345
** fetching of reprint 27547345 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 22610656
** fetching of reprint 22610656 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 23858657
** fetching of reprint 23858657 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 24998529
** fetching of reprint 24998529 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 27859194
** fetching of reprint 27859194 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26991916
** fetching of reprint 26991916 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26742956
** fetching of reprint 26742956 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 28388874
** fetching of reprint 28388874 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
```
Fix 2: Install lxml, the parser library that BeautifulSoup failed to find
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install lxml
Collecting lxml
Downloading https://files.pythonhosted.org/packages/4b/24/300d0fd5130cf55e5bbab2c53d339728370cb4ac12ca80a4f421c2e228eb/lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (5.8MB)
|████████████████████████████████| 5.8MB 2.7MB/s
Installing collected packages: lxml
Successfully installed lxml-4.9.2
```
It runs, but it did not fetch all files. See unfetched_pmids.tsv for the PMIDs that could not be downloaded.
## Run
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ cd /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs$ python /home/cmendezc/Documents/ccg/github-Pubmed-Batch-Download/fetch_pdfs.py -pmf ../list_of_PMIDs.txt
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 21097887
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 21097887 succeeded
Trying to fetch pmid 23818864
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 23818864 failed from error list index out of range
Trying to fetch pmid 24947454
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 24947454 succeeded
Trying to fetch pmid 25222563
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25222563 succeeded
Trying to fetch pmid 25275371
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25275371 succeeded
Trying to fetch pmid 25735747
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 25735747 succeeded
Trying to fetch pmid 26258987
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26258987 succeeded
Trying to fetch pmid 26279566
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 26279566 could not be fetched with the current finders.
Trying to fetch pmid 26670385
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 26670385 failed from error Invalid URL 'f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ': No schema supplied. Perhaps you meant http://f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ?
Trying to fetch pmid 26673755
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26673755 succeeded
Trying to fetch pmid 28061857
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28061857 succeeded
Trying to fetch pmid 28526842
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28526842 succeeded
Trying to fetch pmid 29394395
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 29394395 succeeded
Trying to fetch pmid 30137486
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 30137486 succeeded
Trying to fetch pmid 30389436
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 30389436 could not be fetched with the current finders.
Trying to fetch pmid 30420454
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 30420454 failed from error Invalid URL 'Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg': No schema supplied. Perhaps you meant http://Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg?
Trying to fetch pmid 33172971
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33172971 failed from error Invalid URL 'cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM': No schema supplied. Perhaps you meant http://cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM?
Trying to fetch pmid 34428301
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34428301 succeeded
Trying to fetch pmid 34791440
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34791440 succeeded
Trying to fetch pmid 9140061
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 9140061 failed from error Invalid URL 'jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ': No schema supplied. Perhaps you meant http://jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ?
Trying to fetch pmid 32662815
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32662815 succeeded
Trying to fetch pmid 32817380
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 32817380 failed from error Invalid URL '12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI': No schema supplied. Perhaps you meant http://12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI?
Trying to fetch pmid 32849447
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32849447 succeeded
Trying to fetch pmid 33068046
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33068046 failed from error Invalid URL 'Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ': No schema supplied. Perhaps you meant http://Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ?
Trying to fetch pmid 33072717
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 33072717 succeeded
Trying to fetch pmid 33136147
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 33136147 succeeded
Trying to fetch pmid 33318048
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33318048 failed from error Invalid URL 'H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA': No schema supplied. Perhaps you meant http://H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA?
```
# Text extraction from PDF
We sent the PDF files to the Lisen&Curate team for text extraction.
21097887
23818864
24947454
25222563
25275371
25735747
26258987
26279566
26670385
26673755
28061857
28526842
29394395
30137486
30389436
30420454
33172971
34428301
34791440
9140061
32662815
32817380
32849447
33068046
33072717
33136147
33318048