From 15d3451938e6e79d2d1cd6cbfac2755986774ffa Mon Sep 17 00:00:00 2001
From: Mouhamadou Ba <mouhamadou.ba@inra.fr>
Date: Fri, 15 Oct 2021 17:54:16 +0200
Subject: [PATCH 1/2] correct cirm cfbp entries

---
Review notes (free text after the "---" marker; not part of the commit
message):
  * Points the BIA, Levures and CFBP rules at newer input spreadsheets and
    renames the CFBP tsv (rule output / format-rule input) to match.
  * taxa_index / strain_index / habitat_index for CFBP become
    '3' / '1' / '6,10,13,14' in BOTH get_cirm_cfbp_taxa_habitats and
    format_cirm_cfbp_results. The values are duplicated across the two
    rules, so they have to be edited together.
  * Presumably the new indexes follow the column layout of
    20210617_PPortier.xlsx -- TODO confirm against the spreadsheet.
  * NOTE(review): this copy of the patch was re-split into lines using the
    hunk line counts; indentation and blank context lines are reconstructed,
    so check them against the blobs named on the "index" line before
    applying.

 process_CIRM_corpus.snakefile | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile
index 42a9effc..b86abdd0 100644
--- a/process_CIRM_corpus.snakefile
+++ b/process_CIRM_corpus.snakefile
@@ -18,7 +18,7 @@ get taxa and habitats (CIRM BIA)
 '''
 rule get_cirm_bia_taxa_habitats:
     input:
-        file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx'
+        file='corpora/cirm/BIA_2021/florilege_20_07_2021.xlsx'
     params:
         taxa_index='2',
         strain_index='1',
@@ -37,7 +37,7 @@ get taxa and habitats (CIRM Levures)
 '''
 rule get_cirm_yeast_taxa_habitats:
     input:
-        file='corpora/cirm/Levures_2021/Florilege_21012021.xlsx'
+        file='corpora/cirm/Levures_2021/Florilege_23082021.xlsx'
     params:
         taxa_index='1',
         habitat_index='10,11'
@@ -55,15 +55,15 @@ get taxa and habitats (CIRM CFBP)
 '''
 rule get_cirm_cfbp_taxa_habitats:
     input:
-        file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx'
+        file='corpora/cirm/CFBP_2021/20210617_PPortier.xlsx'
     params:
-        taxa_index='1',
-        strain_index='0',
-        habitat_index='9,10,14,23'
+        taxa_index='3',
+        strain_index='1',
+        habitat_index='6,10,13,14'
     output:
         taxa='corpora/cirm/cfbp_taxa.txt',
         habitats='corpora/cirm/cfbp_habitats.txt',
-        tsv='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv'
+        tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv'
     conda: 'softwares/envs/python3_pandas_env.yaml'
     shell: """
         python3.7 softwares/Florilege/scripts/preprocess-cirm-cfbp.py --input {input.file} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} --taxa-outfile {output.taxa} --habitat-outfile {output.habitats} --tsv-outfile {output.tsv}
@@ -241,15 +241,15 @@ format results (CIRM CFBP)
 '''
 rule format_cirm_cfbp_results:
     input:
-        file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv',
+        file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv',
         taxa='corpora/cirm/mapped_cfbp_taxa.txt',
         habitats='corpora/cirm/mapped_cfbp_habitats.txt'
     output:
         result='corpora/florilege/cirm/cirm-cfbp-results.txt'
     params:
-        taxa_index='1',
-        strain_index='0',
-        habitat_index='9,10,14,23'
+        taxa_index='3',
+        strain_index='1',
+        habitat_index='6,10,13,14'
     conda: 'softwares/envs/obo-utils-env.yaml'
     shell: 'python softwares/Florilege/scripts/format-cirm-cfbp-results.py --cirm {input.file} --taxa {input.taxa} --habitats {input.habitats} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} > {output.result}'
 
-- 
GitLab


From 03a2d9285333014f2ebf401b03c00a4e89ecd05a Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Mon, 18 Oct 2021 12:15:35 +0200
Subject: [PATCH 2/2] Fixed tsv conversion + weird quotes

---
Review notes (free text after the "---" marker; not part of the commit
message):
  * Adds two DataFrame-wide regex replacements right after read_excel.
    The first turns the two-character texts backslash-n / backslash-r, and
    real newline / carriage-return characters, into a single space.
    The second deletes every character listed in the bracket class.
  * As written, that class matches the individual characters double quote,
    left curly quote, a-circumflex and the euro sign, so any standalone
    a-circumflex or euro sign in a text cell is deleted too.
    NOTE(review): looks like this targets a mis-encoded closing quote --
    confirm that stripping a-circumflex from taxon / habitat text is
    acceptable, and that no character of the class was lost in transit.
  * The per-row taxon.replace('"','') is dropped; the second replacement
    already removes the double quote from the loaded text.
  * NOTE(review): line breaks, indentation and the position of the blank
    context lines were reconstructed from the hunk line counts (12 old /
    13 new); verify against the blobs named on the "index" line.

 softwares/Florilege/scripts/preprocess-cirm-cfbp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/softwares/Florilege/scripts/preprocess-cirm-cfbp.py b/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
index c24e2f52..a7452277 100644
--- a/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
+++ b/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
@@ -18,12 +18,13 @@ strain_index = int(args.strain_index)
 habitat_indexes = map(int,args.habitat_index.split(','))
 
 cirm_data = pd.read_excel(args.input, dtype='object')
+cirm_data.replace(to_replace=[r"\\n|\\r", "\n|\r"], value=[" "," "], regex=True, inplace=True)
+cirm_data.replace(to_replace=[r'["“â€]'], value=[""], regex=True, inplace=True)
 
 taxa_dict = set()
 for i in range(len(cirm_data)) :
     taxon = str(cirm_data.iloc[i, taxa_index])
     strain = str(cirm_data.iloc[i, strain_index])
-    taxon = taxon.replace('"','')
     taxa_dict.add(taxon)
     taxa_dict.add(taxon + " CFBP " + strain)
     taxa_dict.add(taxon + " CFBP" + strain)
-- 
GitLab