# -*- coding: utf-8 -*-
"""
Format CRF-annotated Zika (ZIKAdb) terms into the JSON mapping format.
"""
#################### Setup ####################
from optparse import OptionParser
import os
import sys
from pandas import read_table
import format_fun_v6 as format_fun
"""
# Input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
# Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")
    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    # Output files
    ofname = os.path.join(options.output_path, options.out_fname)
    #################### Load input data ####################
    # Load CRF annotations
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    npl_full = npl_full.drop_duplicates()
    print(f"Total Zika terms: {len(npl_full)}")
    obs_cols = set(npl_full.columns)
    if not exp_cols.issubset(obs_cols):
        missing = ", ".join(sorted(exp_cols - obs_cols))
        sys.exit("iAnnotatedFile is missing expected columns: " + missing)
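    # The string literal below preserves the earlier row-by-row implementation
    # (it built per-SRR records with format_fun.get_term_info/created_record and
    # dumped them with json); it appears to be superseded by the single
    # format_fun.to_json call at the end of the script and is kept only for reference.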
"""
df_terms = defaultdict(list)
for idx,row in npl_full.iterrows():
term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False)
df_terms[row.SRR].append(term_record)
df_json = {}
df_tmp = npl_full.drop_duplicates("SRR", keep="first")
for idx,row in df_tmp.iterrows():
srr_record = format_fun.created_record(
info_df = row,
term_list = df_terms[row.SRR],
source = "ZIKAdb",
esource = "database")
df_json[row.SRR] = srr_record
with open(ofname, "w") as output:
json.dump(df_json, output, separators=(',', ':'), indent=4)
df=open(ofname)
df=json.load(df)
print(df["ERR1399578"])
"""
npl_full["MAP"] = False
format_fun.to_json(
df = npl_full,
source_info = "ZIKAdb",
evidence_source = "database",
ofname = ofname)
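    #################### Optional sanity check ####################
    # Hypothetical sketch, not part of the original pipeline: it assumes
    # format_fun.to_json writes a JSON object keyed by SRR, as the retained
    # reference block above did. Uncomment to inspect the mapped output.
    # import json
    # with open(ofname) as check_file:
    #     mapped = json.load(check_file)
    # print(f"Mapped SRR records written to {ofname}: {len(mapped)}")
    # print(next(iter(mapped.values())))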