# transform_sentences_to_check_to_XML.py (2.18 KB)
import pandas as pd
import os

def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
    """Convert a CSV of sentences to check into an XML file.

    The CSV at ``inputPath/inputFile`` is loaded with pandas, its
    ``'Unnamed: 0'`` column (if present) is renamed to ``'row'``, and the rows
    are sorted by ``original_sentence``. The first five sorted rows are printed
    to stdout. Each row is then written to ``outputPath/outputFile`` as a
    ``<row id="...">`` element inside a ``<gcs_to_check>`` root, with child
    elements ``serie``, ``serie_pubmed_id``, ``sample``, ``field_name``,
    ``original_sentence`` and ``corrected_sentence``. ``corrected_sentence``
    is pre-filled with the same text as ``original_sentence``.

    All values are XML-escaped, so sentences containing ``&``, ``<`` or ``>``
    still yield well-formed XML, and the file is written as UTF-8 to match the
    encoding declared in the XML prolog.

    Args:
        inputPath: Directory containing the input CSV.
        outputPath: Directory where the XML file is written.
        inputFile: File name of the input CSV.
        outputFile: File name of the XML file to create (overwritten if it
            exists).

    Raises:
        KeyError: If the CSV lacks one of the columns read below.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape, quoteattr

    df_sentences_to_check = pd.read_csv(os.path.join(inputPath, inputFile))
    df_sentences_to_check.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
    df_sentences_to_check = df_sentences_to_check.sort_values(by=['original_sentence'])
    print(df_sentences_to_check.head(5))

    # Columns emitted as child elements, in output order.
    element_columns = ('serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence')
    # Iterate the columns in lockstep (in sorted order) instead of doing a
    # label lookup per cell; this yields scalars even if index labels repeat.
    records = zip(
        df_sentences_to_check['row'],
        *(df_sentences_to_check[name] for name in element_columns)
    )

    # The prolog declares UTF-8, so the file must actually be written as UTF-8
    # rather than in the platform's default encoding.
    with open(os.path.join(outputPath, outputFile), mode='w', encoding='utf-8') as ofile:
        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
        for row_id, *values in records:
            # quoteattr adds the surrounding quotes and escapes the value.
            parts = ['<row id={}>\n'.format(quoteattr(str(row_id)))]
            for tag, value in zip(element_columns, values):
                parts.append("\t<{0}>{1}</{0}>\n".format(tag, escape(str(value))))
            # The corrected sentence starts out as a copy of the original
            # (the last entry of element_columns).
            parts.append("\t<corrected_sentence>{}</corrected_sentence>\n".format(escape(str(values[-1]))))
            parts.append("</row>\n")
            ofile.write(''.join(parts))
        ofile.write('</gcs_to_check>\n')

# Script invocation: convert the "fixed" curated-sentences CSV into XML,
# reading from and writing to the same curated-sentences directory.
# The alternative input file is 'geo_sentences_to_check.csv'.
transform_sentence_to_check_to_XML(
    inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
    outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
    inputFile='geo_sentences_to_check_fixed.csv',
    outputFile='geo_sentences_to_check_fixed.xml',
)