
# Gathering input document collection

import stanza
import argparse
import re
import os
import pandas as pd

# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")

    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs into consideration
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
    tags = {
        '<Gtype>': 'Gtype',
        # '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        # '<Substrain>': 'Substrain',
        '<Supp>': 'Supp',
        # '<Strain>': 'Strain',
        # '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Air>': 'Air',
        '<Vess>': 'Vess',
        '<pH>': 'pH'
    }
    #tags = ['<Gtype>', '<Med>', '<Phase>', '<Supp>',
    #        '<Temp>', '<OD>', '<Anti>', '<Agit>',
    #        '<Air>', '<Vess>', '<pH>']
    #deleted_tags = ['<Gversion>', '<Substrain>', '<Strain>', '<Technique>']
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if line has a tag
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete tags
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regex to substitute tags
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    #p = re.compile(r'blue (?P<animal>dog|cat)')
    #p.sub(r'gray \g<animal>', s)
    # Regex to tag GCs
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

    # Testing file: GSE54899_family_retagged-05242019_validated.xml
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"

    # Define stanza pipeline for sentence segmentation
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)

    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""

    # Store list of unique field_name
    hash_field_name = {}

    # Store sentences from fields that contained at least one GC tag.
    # We want to use this list for someone to check it
    df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])

    # Store serie number
    # ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id
    # !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample
    # ^SAMPLE = GSM1326335
    sample = ""

    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file == testing_file:
                print(" Reading file..." + str(file))
                with open(os.path.join(args.inputPath, file)) as iFile:
                    for line in iFile:
                        line = line.rstrip('\n')
                        if line.find(" = ") == -1:
                            continue
                        # Split only on the first " = " so field values containing " = " are kept whole
                        list_line = line.split(" = ", 1)
                        field_name = list_line[0]
                        #print("field_name: {}".format(field_name))
                        field_text = list_line[1]
                        #print("field_text: {}".format(field_text))
                        if field_name == "^SERIES":
                            serie = field_text
                        elif field_name == "!Series_pubmed_id":
                            serie_pubmed_id = field_text
                        elif field_name == "^SAMPLE":
                            sample = field_text
                        elif regex_has_tag.search(line):  # Contains GC tag
                            if field_name in hash_field_name:
                                hash_field_name[field_name] += 1
                            else:
                                hash_field_name[field_name] = 1
                            original_sentence = field_text
                            # delete tags we do not keep (regex_delete_tag matches both <X> and </X>)
                            modified_sentence = regex_delete_tag.sub("", field_text)
                            # substitute kept tags with INI_/END_ markers
                            # p = re.compile(r'blue (?P<animal>dog|cat)')
                            # p.sub(r'gray \g<animal>', s)
                            modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                            modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                            doc = nlp(modified_sentence)
                            for i, sentence in enumerate(doc.sentences):
                                # print(sentence.text)
                                list_transformed_sentence = []
                                # For GC tag
                                gc_tag = "O"
                                in_tag = False
                                for word in sentence.words:
                                    result = regex_gc_ini_tag.match(word.text)
                                    if result:
                                        gc_tag = result.group("tag")
                                        in_tag = True
                                        continue
                                    else:
                                        result = regex_gc_end_tag.match(word.text)
                                        if result:
                                            gc_tag = "O"
                                            in_tag = False
                                            continue
                                        else:
                                            if not in_tag:
                                                gc_tag = "O"
                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                                transformed_sentence = " ".join(list_transformed_sentence)
                                new_row = {'serie': serie,
                                           'serie_pubmed_id': serie_pubmed_id,
                                           'sample': sample,
                                           'field_name': field_name,
                                           'original_sentence': original_sentence,
                                           'modified_sentence': sentence.text,
                                           'transformed_sentence': transformed_sentence}
                                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
                                df_sentences_to_check = pd.concat([df_sentences_to_check, pd.DataFrame([new_row])], ignore_index=True)
                                df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    #print(token)
    # Stop here: everything below is legacy code (see the note below)
    quit()

    # NOTE: everything below appears to be legacy code from an earlier, CoreNLP-based
    # version of this script. It is unreachable (quit() above stops execution) and it
    # relies on names that are not defined here: in_labels, random, args.inputFile,
    # args.index, args.trainingFile and args.testFile.
    ## End of tagging
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }

    # Other label
    flag = 'O'
    lista = []
    # First sentence
    sentence = ''
    n = 0
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        n = n + 1
                        words = sentence.split(' ')
                        # End of sentence
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # At least one true-tag on sentence
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Build and save the tagged sentence
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')

    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))

    # Split 70 30 training and test sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))

    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
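To make the transformation above concrete, here is a minimal, self-contained sketch (the sample field text is invented and stanza is left out, so the real output carries two extra fields, lemma and xpos, per token): tags that are not considered are deleted, kept tags become INI_/END_ markers, and every token between a pair of markers inherits that tag while all other tokens get 'O'.

```python
import re

# Same tag lists as in the script above
tags = ['Gtype', 'Med', 'Phase', 'Supp', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']

regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')

# Hypothetical tagged field text (not taken from a real GEO series)
field_text = "cells grown in <Med>LB</Med> at <Temp>37 C</Temp> until <Phase>mid-log</Phase> <Technique>RNA-seq</Technique>"

# 1) drop the tags we do not keep, 2) rewrite kept tags as INI_/END_ markers
modified = regex_delete_tag.sub("", field_text)
modified = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified)
modified = regex_subs_end_tag.sub(r' END_\g<tag> ', modified)

# Mimic the per-word loop with whitespace tokens instead of stanza words
# (the real script emits word|lemma|xpos|tag; lemma and xpos are omitted here)
gc_tag, out = "O", []
for tok in modified.split():
    if tok.startswith("INI_"):
        gc_tag = tok[len("INI_"):]
        continue
    if tok.startswith("END_"):
        gc_tag = "O"
        continue
    out.append("{}|{}".format(tok, gc_tag))

print(" ".join(out))
# cells|O grown|O in|O LB|Med at|O 37|Temp C|Temp until|O mid-log|Phase RNA-seq|O
```

The real script does the same on stanza words and also records the serie, sample and field name of every sentence in geo_sentences_to_check.csv.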
# Input article collection
We used lists of PMIDs from article collections delivered by curators (Víctor, Soco, Paloma):
- DRIVE (https://docs.google.com/spreadsheets/d/1OayfQ7ODgnU4d5PQ3SUAmFX3Tc27PocCHZ6flPXwLKc/edit?usp=sharing)
- Asana (https://app.asana.com/0/1200927210854847/1203428992254399/f)

# Download PDFs
We used the [Pubmed-Batch-Download](https://github.com/billgreenwald/Pubmed-Batch-Download) tool to download PDF files.
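The `-pmf` file we pass to `fetch_pdfs.py` (`list_of_PMIDs.txt`, reproduced at the end of this page) is simply one PMID per line. A minimal sketch, assuming the curated PMIDs are already in a Python list, for writing that file (the three PMIDs are just examples from our list):

```python
# Write the one-PMID-per-line file that fetch_pdfs.py receives via -pmf.
# The PMIDs below are just examples taken from our list_of_PMIDs.txt.
pmids = ["21097887", "24947454", "25222563"]

with open("list_of_PMIDs.txt", "w") as handle:
    handle.write("\n".join(pmids) + "\n")
```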
## Installation
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ git clone https://github.com/billgreenwald/Pubmed-Batch-Download.git
Cloning into 'Pubmed-Batch-Download'...
remote: Enumerating objects: 202, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (12/12), done.
remote: Total 202 (delta 5), reused 0 (delta 0), pack-reused 190
Receiving objects: 100% (202/202), 31.23 MiB | 1.09 MiB/s, done.
Resolving deltas: 100% (102/102), done.
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ mv Pubmed-Batch-Download/ github-Pubmed-Batch-Download
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg$ cd github-Pubmed-Batch-Download/
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ ls -l
total 52
-rw-rw-r-- 1 cmendezc cmendezc 72 ene 5 11:31 example_pmf.tsv
-rw-rw-r-- 1 cmendezc cmendezc 11430 ene 5 11:31 fetch_pdfs.py
-rw-rw-r-- 1 cmendezc cmendezc 18711 ene 5 11:31 fetch_pdfs_toScript.ipynb
-rw-rw-r-- 1 cmendezc cmendezc 551 ene 5 11:31 pubmed-batch-downloader-py3-windows.yml
-rw-rw-r-- 1 cmendezc cmendezc 895 ene 5 11:31 pubmed-batch-downloader-py3.yml
-rw-rw-r-- 1 cmendezc cmendezc 3667 ene 5 11:31 README.md
drwxrwxr-x 2 cmendezc cmendezc 4096 ene 5 11:31 ruby_version
-rw-rw-r-- 1 cmendezc cmendezc 0 ene 5 11:31 unfetched_pmids.tsv
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda env create -f pubmed-batch-downloader-py3.yml
```
## Testing
Error!
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Traceback (most recent call last):
  File "fetch_pdfs.py", line 64, in <module>
    from bs4 import BeautifulSoup
ModuleNotFoundError: No module named 'bs4'
```
Fix 1: Install bs4
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install bs4
Collecting bs4
  Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
  Using cached https://files.pythonhosted.org/packages/9c/d8/909c4089dbe4ade9f9705f143c9f13f065049a9d5e7d34c828aefdd0a97c/beautifulsoup4-4.11.1-py3-none-any.whl
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached https://files.pythonhosted.org/packages/16/e3/4ad79882b92617e3a4a0df1960d6bce08edfb637737ac5c3f3ba29022e25/soupsieve-2.3.2.post1-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... done
  Stored in directory: /home/cmendezc/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1
```
Error!
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ python fetch_pdfs.py -pmf example_pmf.tsv
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 27547345
** fetching of reprint 27547345 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 22610656
** fetching of reprint 22610656 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 23858657
** fetching of reprint 23858657 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 24998529
** fetching of reprint 24998529 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 27859194
** fetching of reprint 27859194 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26991916
** fetching of reprint 26991916 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 26742956
** fetching of reprint 26742956 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Trying to fetch pmid 28388874
** fetching of reprint 28388874 failed from error Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
```
Fix 2: Install lxml (the parser BeautifulSoup asks for)
```shell
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ pip install lxml
Collecting lxml
  Downloading https://files.pythonhosted.org/packages/4b/24/300d0fd5130cf55e5bbab2c53d339728370cb4ac12ca80a4f421c2e228eb/lxml-4.9.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (5.8MB)
     |████████████████████████████████| 5.8MB 2.7MB/s
Installing collected packages: lxml
Successfully installed lxml-4.9.2
```
It runs, but it did not fetch all the files; see unfetched_pmids.tsv.

## Run
```shell
(base) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ conda activate pubmed-batch-downloader-py3
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/github-Pubmed-Batch-Download$ cd /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs
(pubmed-batch-downloader-py3) cmendezc@cmendezc-Latitude-7400:~/Documents/ccg/gitlab-automatic-extraction-growth-conditions/extraction-literature/input/pdfs$ python /home/cmendezc/Documents/ccg/github-Pubmed-Batch-Download/fetch_pdfs.py -pmf ../list_of_PMIDs.txt
Output directory of fetched_pdfs did not exist. Created the directory.
Trying to fetch pmid 21097887
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 21097887 succeeded
Trying to fetch pmid 23818864
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 23818864 failed from error list index out of range
Trying to fetch pmid 24947454
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 24947454 succeeded
Trying to fetch pmid 25222563
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25222563 succeeded
Trying to fetch pmid 25275371
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 25275371 succeeded
Trying to fetch pmid 25735747
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 25735747 succeeded
Trying to fetch pmid 26258987
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26258987 succeeded
Trying to fetch pmid 26279566
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 26279566 could not be fetched with the current finders.
Trying to fetch pmid 26670385
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 26670385 failed from error Invalid URL 'f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ': No schema supplied. Perhaps you meant http://f680gMuZlnwT4304lwsG531xpi4vbk83nDntGZ4l27M-1672941516-0-AdC4bMbcOc9cCSwDp4lsqirHOW3zv1msNUw8lijeZIxdN3BDTBUy983qf_LAfBVEhkt4k2Xwu_NYMJeaq3oG4LTyfDBxN2Ra-cmmTDVIK66GtTB9oyyn4GqMem1PTBwVEFtzEYcB4AOoR8EGbwWrEZa1jPMupBq_gJ0JlxuIGbBJw3SuuioKmRlQT_TDXNREjT2Av3DHrrz6C008shr-pgrTtAoM5aZ0N4clcoBQ1FWX04MZm-nPOxI-2zbxcHUYXqV91lbH7iWkztZWPcv6-Q3ePiFD6_-C7pdY_Mf0Y670kOKyhoqlZ0m3PqPm64-37r-nzxrcd2Z0MWJUMC8Jx1b1OA1e53TJy62F2K5ws3U82zktr4gEDS11A13r8DIn1wRCEH2dk8jI02NQoIp3JBTvUixhiNkWib01Zl7l7iAFLOJtWlVbeUsOwCh6imfV5m-2No7-SiGaur5Ip6Zf3ACDki_CjXifHxtGVh1TbvnYsBeUdoaWV3TsXdGvF7AVr_ytXg4-JiIHhaZ-SdCzpe65bWZmvwrIpCfZOEBOC-gNTm3tq5h1_2iQzVTinGQonsXdwLCYSKQeZRQ-qEFf7y4PpesHamAXmw1OZlZJtKlFgXx9MoCBp0Irx8ChWyIo5RhSBoa9j1_JW8AX1x7KDY3UX32ItW7-a2Qw5IEL_FRS6cyXOg1FLeHlanntIl11kKWmXyJ86bsvEQBn2Q9-1kMvisDZaM0LrfNT9KcghdkLgdpzsDEf-B4_MKdkVAhkBQ?
Trying to fetch pmid 26673755
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 26673755 succeeded
Trying to fetch pmid 28061857
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28061857 succeeded
Trying to fetch pmid 28526842
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 28526842 succeeded
Trying to fetch pmid 29394395
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 29394395 succeeded
Trying to fetch pmid 30137486
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 30137486 succeeded
Trying to fetch pmid 30389436
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
Trying direct_pdf_link
** Reprint 30389436 could not be fetched with the current finders.
Trying to fetch pmid 30420454
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 30420454 failed from error Invalid URL 'Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg': No schema supplied. Perhaps you meant http://Ldk64CcSIXJkCs8pPt2YjpLkZJhaKs_m0pmekzL5SOY-1672941544-0-AaBQfP_66yID5nRb-xUznbW2FVXAWeiOcKnTosB_FUSxeeNSucukyHbo7OxAcJttrfA4pKDGasC8MvQz6o0cFBZv0VU_RfYXUn7Z6iVg5eVp7n_O4P9Zzk0IiE132EMNR-Xn0_gEfYM8DMCX5lS4yEgrs9hwhdJIWzS6N3fsDsa3kDjIH9oELTaDEbTbFvUXEkx3212-4NJ6SwCvfUzhtolsD7xJoswFQHjNBFrmUgScEORQpIWTWxzHPvpGTxepQMUPuAEbgNNykNbdp9oyLMDwmUnIqU7hSmeCkYU1RWlbxh95rcgH-yvV9mm3RQnIXT3WfcUE9lM5crnbBcplVCA4jbLP7kk1tu_BFbh-6gstCr0B24gEE5zJ41WGxwTbABhAmK7aAeHbH7V55EBpLOQcpkYhWZNiMMbVsG314TM_tE9UGM8B99FmrWUqCqwMcsGwDDWK7B-uHcDD5nJxQhgV5SlMnS0IVE18Bdu4zqIzT3ZS2sgGf9Drti4P5Qkso3v1pW_fBzq-Mrd6_O7cvwF7FlRc95tOXSjjS0Woc70HGNBNd1kc0ZR9NuwV9TnvPRWbuoYu3HXz65DeWmbGaLOFdHAOUARr1fD9DL9LRDmeAHOGkYkplz9pSbWXR6vYkIInqFnvQKuwhnVOltaWa6_VG3BH0oc9T4xAZdH83DsG6eHtJlitVhH8Sx_PBfukG4x0S1qsmIWUPDwZhwUe55ly0I5ISELLL8Z3tAJpq3zrdyV6CbwOjF7-nPF7aRNuxg?
Trying to fetch pmid 33172971
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33172971 failed from error Invalid URL 'cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM': No schema supplied. Perhaps you meant http://cIp5BepthG7srw3ecaG_06Qhx8PrpoP6WpFwlAopfrE-1672941545-0-AXmWkR9H8dN8IxbsllbsHZ1SigvIVLyZ0euRPz15XW6nX_MsA3Y9dPoL0MovKxj_yUNiDnSYrVSmYNzVo-LEANJZq45ZpzDVv1GFU1qNu0PpI-0YxuWHz4dSudrD_soFz3LsTCtgLamU66ZDSsrVNGTaqqbajetYgnnhu4K-BeYnLmpOxzcMzYU9mgynHjFv0NnrdUU75kJPeOIRpgrUlqm8JRnkMq0SEvI2IPTDW0ToohbWs4bLvLX0GNKGVT_v5to_am4hEVPC9jmkfkkNOLoMmfbnZC-L2EGAKufwZgz17d89HWfaK61no8EW8y5ysZ5A9yTRfN__C_LpTG6FWw2HWyR9FgIvz799f4ysIoz52azp4a7w3G3AHCWdUBUDy6gabo_psIE4mu3dCHLcDzGNO148UT5wzxTfrQV3aatPAWjnaK6-Re0XOkABNINniMLfF6Ti-0WgY-cHyLH2RgKISy_89MeNrVJy22GToy2c_LQwZN3RT3M8M2TFXLXmi9xEE4Z_4kSRA_aRnvRjKJdMJfxhc-BYW1G-dn2SDAetNZZL7HcJW6cGlAIjNWQqTD9ieGfLxGJe0OCLysFkeY3XwRY5vTHQ-xVI-gGKBY0A9gS70DH5t_pS53fBTQZ1pK667ct-BCo5aysuQHLcXrlE9coo8k8vtQKrmQ5-Fxp2ZNV_MKLY5yqBj5yAWJI5b_O-Mp5TyE9Zzyte_cTXqYtO14DAr1ev8TwqZP3YNbunHBcvIO20uVjbNxc4m--ARi5MMuxcDg4Kvju8Dbf3YKM?
Trying to fetch pmid 34428301
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34428301 succeeded
Trying to fetch pmid 34791440
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 34791440 succeeded
Trying to fetch pmid 9140061
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 9140061 failed from error Invalid URL 'jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ': No schema supplied. Perhaps you meant http://jcZKP5wy_U5fZwQzMnXsls3TZuFKtLfM.TRt.Bh4d9k-1672941556-0-ARg_NFIhKoWkMSTkM9K0NNsEvXccNV6TpzvRoQ-2vYjK8XkBbOYBTbXDk7ayo2JehzHWQXv5Q_R2fac_6YNLMbXVLvOx_MPE2G55FZnUH5eyYoAVkc294_DbWF4BkOBr9bbRZ77KShHUYqJjAOi2O7mvSeGRhr8aCrq258YcVJ0FBdP8Q5tuy2CNWxi_udpInouGKC_Bnbb4D6LtrmOH2qchHRdKNei5ina55N2xPiH6jVDZ21jK0SkCSagtetSHnT7A-CfaFwqG5cz5lnOs1l1bBFEcOFdNNkmvz5yGZK-RR1-gynCmgS1ixfHapDjmCyogIfAxI1oumhPQoHCCg8-OqSgMSXHbgJdPWvc5L68Unmk5BAZNeFU2F_-xInoVtpYPwJNkeyldxj98PbHPAYg-SqmRtv0MyKm9qcEZJIULlfwTZ2ZGAm_uAwcQ7fW_O9VfUNBlbt2SohoYWfCtILAc2Imgon6vNbdisaxRkf70SZuD0G-Fj2SCsAYhkQrqPCdJAEEfWJ1QiddGb32kTSnXCoupFAWbX441Xj4nOj5OaRem_6JScd2AJp-YxSNI0Nm4IrB8s5O_lG1o_BDYlplFwbKozatP9ckn0jeXx38wInIuKOjUgl9B_T2Xvkg6sCNxXUWsHXiHMkhQ3x2AEh47zf4T6vQoTi0wNMkUVtkNTh8gOviKKl74Pi4m3yyq1ICnA9L9D6E6MLuE_ZOfmVBM79sVEgN8jsDBojevmYv96r09rQaQ_9c5cFZk7E25jQ?
Trying to fetch pmid 32662815
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32662815 succeeded
Trying to fetch pmid 32817380
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 32817380 failed from error Invalid URL '12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI': No schema supplied. Perhaps you meant http://12Kdq1Gu6s3H_c9wV7MWTwYP1d4sz7FRpWv08o2fehs-1672941561-0-AQzKoSKCDX92o3mW4XorQJa2qF1s8dsMn24r1239tDd-OxIEJ-xofWlfZb7cmDWmkZ-d4uCYOdMgyimJ9BwqBkuJKbRguJ_HaG4KzuT0CwTAflqmSgiP6oaZRbxRIMOl3LAnhQXawFffYKLbyEKG9hEWBeEbs31LlzwG7k7IbodBBPNfYicYC2QJy8RZ5xHWPTXxcwshhdG__QByEK9fJ6RYaR8LVhOwXo-m6nKcnmcvdFAubYorAvVvggpPCiIA0EYouK_-KA_Et9mXMtoRPVhEKeO03k9LAejSpvDDd8praPe4uYMGyBe4ruFtFbjqOdJgmlwSt_hPsHu_iFLkl6eW-V_dW5iwEQOE9z1jSjKf1ZHznUnde5Nzlh3v0wV2po1Y1QuFKuy8_IO-DB4iU3MlzHKgWqCsAeLorSaui7KqJAGzqmM3Keurq7J4URVd8khAGmHXMZHt3u96krRlFp3Nsc1_jwJEKKLxr44FVFla7XnqlQIHXdzj9FffjdPd1R_p3G-UEYGLzL32dFulkql4INTbOR625BrjoAvw74XDQRcNE_P72PYyCRUSIarPTtFTQBMSfpxRaOprcTZfMR_U5zdY0uGixU3srbPeduCUA7tQOFiCiLoTD_odsa75NYCv9o_me1vJSA823Md4hCV947suGwjybNaQP-R-yrffAfni7dQRYMt-mjEHk5LtnebhpJi1G44UN_WFSDpOkB6lvKO7Qc-eoUXnm4DbeysVDTRAmVi94HkcG9tc1U7BeSVyXNUfq3C1Vr_1jJXCgUI?
Trying to fetch pmid 32849447
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 32849447 succeeded
Trying to fetch pmid 33068046
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33068046 failed from error Invalid URL 'Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ': No schema supplied. Perhaps you meant http://Ca2tz3FARdmZBsByuyNuWeiql2uee1VreT.kVjY7yrk-1672941567-0-AexEfZqeSZGpnAKuvb4N24mFbbARpMqS4Bl7rq2oJaJLQy4XNqEXY1SvQ53OXwzuh9s8hJpJSmZKZ90s8So4WTMitRZFt0iwKRvwq5PfF8ZF-spmYvUyZmqSAcRty7hyAnlIItHCbvd0DXymu2foqGLiY7_Azyn4oIZjqDWZgwUu4cttCsPTlTJtscKhrnIDiTC2AD-6BrcAHq2eFMQXn27imPIx1RCRlJshGeDr1vbtfjlBg89wEfvUQMpUEgz-xVlFP2tkES_AqE3RIqDBCDIDkDuwxhKZ5d-k_PxAuN3Vbx-1nlLI7WeIZH3b-qHkPWg8ifOx6RsMU_A02ZEHMrjlftm66SFQ60Wsria5dpTeLxvGd34BBngLodgDKaYoG0ztHkPImcz4lT76J7-QCgKcV7O86u_4mEpHhONMbCRBLtVhcFVAX-zAMIyOWzECJ6x0Sau9cAqssr2l_Q1VT-f4uCaFA5KpmuC3IHUZQABkrvM9nh0uOhB2e7ln9OfxBG89KhjhGPRhio2LRDY4yprcBdzS-dNl1pedPEXENepuOg0R645bq0poGP4uKeYHuQ?
Trying to fetch pmid 33072717
Trying genericCitationLabelled
Trying pubmed_central_v2
** fetching reprint using the 'pubmed central' finder...
** fetching of reprint 33072717 succeeded
Trying to fetch pmid 33136147
Trying genericCitationLabelled
** fetching reprint using the 'generic citation labelled' finder...
** fetching of reprint 33136147 succeeded
Trying to fetch pmid 33318048
Trying genericCitationLabelled
Trying pubmed_central_v2
Trying acsPublications
Trying uchicagoPress
Trying nejm
Trying futureMedicine
Trying science_direct
** fetching of reprint 33318048 failed from error Invalid URL 'H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA': No schema supplied. Perhaps you meant http://H_HDOMjPFqBRBn9CIvFp3.MLyt6.Cr1yqjPqy7_.dNo-1672941585-0-ASGJyet_JMjh-n9RCjZP1usTaU-rAh_oVPlNBV8Ox06oZLjLmr4nLazOPGibTSzbDun4wRRfxjJD1cl8pFLvWgZNLCwgScfdMEuTEYcHelG8wh84ZPO7-PimWyY4a-Ax_JW2wfMsWOrdcFRmKRfdjpL4MFDyEGVMcjhzP9y84LW4EnDyNZqVSkX_y8VAxIbmaMeuS-EiSakyeV1RnV4_bKjzzuQXXLtk8wIhc-rF2VoAiFgTRP3-kR7Y02rLN1opo-OYhoQ29Xy2fAHIKm2pS-qBW0XNRWiOU6q8_YMmMZrWbskiukxzgyZO5MutUF8ygYDuzaZDjX0BtuezjJEtcKWslPbaM1gXj1L8Yy3U7YCwi-_CPUjNOrvFnW0EEm5jeKDUVwwIeY1-sd54wUjlnn86c6qAqpaI4unKjLk4makfoIUlKr4B62VwsTRnrfbZxbqDTyl5jZjFIGiHmrmzPXxt1QG7SrQwApYoGQYFiijEUTw-7IM3t7bXcwRYTMVfbXUEhv8JrzvShSa8x1fDEwHgU2fUnY6BoCOrpC9hZShy1xlZSOOpw4AHgCW272GoFIr2PJ-Zy1UNze2TXebaUegtUpleiM-BhsDhqCAaGxAj9SQsD153z7wtiM6kCMnHOz9IhaKIkgYpKYgXwcQmuzLZUWgWJFJ0lqYeSgvlKAgHjzRY_3Jt2gPT3L2GcgUZQXWWRx4Hs4jL2tUvAiOuqPfvPWSFVjGTZPjZCd3VVFrqpcZCh2v85PiksdgNk05aMA?
```
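
Not every PMID could be fetched automatically (see the failures above). A small sketch to list which PMIDs from `list_of_PMIDs.txt` still lack a PDF, assuming `fetch_pdfs.py` names each downloaded file `<pmid>.pdf` inside `fetched_pdfs/` (an assumption; adjust the check if the actual file names differ):

```python
import os

# Hypothetical helper: report PMIDs from list_of_PMIDs.txt that have no PDF yet.
# Assumes each downloaded file is named <pmid>.pdf inside fetched_pdfs/.
pmid_file = "list_of_PMIDs.txt"
pdf_dir = "fetched_pdfs"

with open(pmid_file) as handle:
    pmids = [line.strip() for line in handle if line.strip()]

fetched = set(os.listdir(pdf_dir)) if os.path.isdir(pdf_dir) else set()
missing = [p for p in pmids if "{}.pdf".format(p) not in fetched]

print("{} of {} PDFs present; missing: {}".format(len(pmids) - len(missing), len(pmids), missing))
```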

# Text extraction from PDF
We sent the PDF files to the Lisen&Curate team for text extraction.

List of PMIDs used for the download above (list_of_PMIDs.txt):

21097887
23818864
24947454
25222563
25275371
25735747
26258987
26279566
26670385
26673755
28061857
28526842
29394395
30137486
30389436
30420454
33172971
34428301
34791440
9140061
32662815
32817380
32849447
33068046
33072717
33136147
33318048