# transform_sentences_to_check_to_XML.py
# 2.18 KB
import pandas as pd
import os
def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
    """Convert a CSV of sentences to check into an XML file.

    The CSV at ``inputPath/inputFile`` is read with pandas, its
    ``Unnamed: 0`` column (if present) is renamed to ``row``, and the records
    are sorted by ``original_sentence``.  The first five sorted records are
    printed to stdout.  One ``<row>`` element per record is then written to
    ``outputPath/outputFile`` inside a ``<gcs_to_check>`` root element.

    ``<corrected_sentence>`` is pre-filled with the original sentence
    (presumably as the starting point for manual correction -- confirm with
    the curation workflow).

    All values are XML-escaped, so sentences containing ``<``, ``>`` or ``&``
    still yield a well-formed document, and the file is written as UTF-8 to
    match the encoding announced in the XML declaration.

    Args:
        inputPath: Directory containing the input CSV.
        outputPath: Directory where the XML file is written.
        inputFile: Name of the input CSV file.
        outputFile: Name of the output XML file.

    Raises:
        KeyError: If one of the columns ``row``, ``serie``,
            ``serie_pubmed_id``, ``sample``, ``field_name`` or
            ``original_sentence`` is missing from the CSV.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from xml.sax.saxutils import escape

    columns = ['row', 'serie', 'serie_pubmed_id', 'sample', 'field_name',
               'original_sentence']

    df_sentences_to_check = pd.read_csv(os.path.join(inputPath, inputFile))
    df_sentences_to_check.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
    df_sentences_to_check = df_sentences_to_check.sort_values(by=['original_sentence'])
    print(df_sentences_to_check.head(5))

    # Select the columns before opening the output file so that a missing
    # column fails early instead of leaving a truncated XML file behind.
    records = df_sentences_to_check[columns].itertuples(index=False, name=None)

    def as_text(value):
        # Same string conversion as before, plus escaping of &, < and >.
        return escape('{}'.format(value))

    with open(os.path.join(outputPath, outputFile), mode='w', encoding='utf-8') as ofile:
        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
        for row_id, serie, pubmed_id, sample, field_name, sentence in records:
            # The id sits inside a double-quoted attribute, so quotes must be
            # escaped there as well.
            row_attr = escape('{}'.format(row_id), {'"': '&quot;'})
            sentence_xml = as_text(sentence)
            ofile.write(''.join([
                '<row id="{}">\n'.format(row_attr),
                "\t<serie>{}</serie>\n".format(as_text(serie)),
                "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(as_text(pubmed_id)),
                "\t<sample>{}</sample>\n".format(as_text(sample)),
                "\t<field_name>{}</field_name>\n".format(as_text(field_name)),
                "\t<original_sentence>{}</original_sentence>\n".format(sentence_xml),
                "\t<corrected_sentence>{}</corrected_sentence>\n".format(sentence_xml),
                "</row>\n",
            ]))
        ofile.write('</gcs_to_check>\n')
# Script entry point: run the conversion only when this file is executed
# directly, so importing the module does not read or write any files.
if __name__ == '__main__':
    # Input CSV and output XML live in the same curated-sentences directory.
    curated_sentences_dir = '/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences'
    transform_sentence_to_check_to_XML(
        inputPath=curated_sentences_dir,
        outputPath=curated_sentences_dir,
        inputFile='geo_sentences_to_check_fixed.csv',
        # Alternative input file:
        # inputFile='geo_sentences_to_check.csv',
        outputFile='geo_sentences_to_check_fixed.xml',
    )