From 15d3451938e6e79d2d1cd6cbfac2755986774ffa Mon Sep 17 00:00:00 2001
From: Mouhamadou Ba <mouhamadou.ba@inra.fr>
Date: Fri, 15 Oct 2021 17:54:16 +0200
Subject: [PATCH 1/2] Correct CIRM CFBP entries and update CIRM input files

---
 process_CIRM_corpus.snakefile | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile
index 42a9effc..b86abdd0 100644
--- a/process_CIRM_corpus.snakefile
+++ b/process_CIRM_corpus.snakefile
@@ -18,7 +18,7 @@ get taxa and habitats (CIRM BIA)
 '''
 rule get_cirm_bia_taxa_habitats:
 	input:
-		file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx'
+		file='corpora/cirm/BIA_2021/florilege_20_07_2021.xlsx'
 	params:
 		taxa_index='2',
 		strain_index='1',
@@ -37,7 +37,7 @@ get taxa and habitats (CIRM Levures)
 '''
 rule get_cirm_yeast_taxa_habitats:
 	input:
-		file='corpora/cirm/Levures_2021/Florilege_21012021.xlsx'
+		file='corpora/cirm/Levures_2021/Florilege_23082021.xlsx'
 	params:
 		taxa_index='1',
 		habitat_index='10,11'
@@ -55,15 +55,15 @@ get taxa and habitats (CIRM CFBP)
 '''
 rule get_cirm_cfbp_taxa_habitats:
 	input:
-		file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx'
+		file='corpora/cirm/CFBP_2021/20210617_PPortier.xlsx'
 	params:
-		taxa_index='1',
-		strain_index='0',
-		habitat_index='9,10,14,23'
+		taxa_index='3',
+		strain_index='1',
+		habitat_index='6,10,13,14'
 	output:
 		taxa='corpora/cirm/cfbp_taxa.txt',
 		habitats='corpora/cirm/cfbp_habitats.txt',
-		tsv='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv'
+		tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv'
 	conda: 'softwares/envs/python3_pandas_env.yaml'
 	shell: """
 		python3.7 softwares/Florilege/scripts/preprocess-cirm-cfbp.py --input {input.file} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} --taxa-outfile {output.taxa} --habitat-outfile {output.habitats} --tsv-outfile {output.tsv}
@@ -241,15 +241,15 @@ format results (CIRM CFBP)
 '''
 rule format_cirm_cfbp_results:
 	input:
-		file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv',
+		file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv',
 		taxa='corpora/cirm/mapped_cfbp_taxa.txt',
 		habitats='corpora/cirm/mapped_cfbp_habitats.txt'
 	output:
 		result='corpora/florilege/cirm/cirm-cfbp-results.txt'
 	params:
-		taxa_index='1',
-		strain_index='0',
-		habitat_index='9,10,14,23'
+		taxa_index='3',
+		strain_index='1',
+		habitat_index='6,10,13,14'
 	conda: 'softwares/envs/obo-utils-env.yaml'
 	shell: 'python softwares/Florilege/scripts/format-cirm-cfbp-results.py --cirm {input.file} --taxa {input.taxa} --habitats {input.habitats} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} > {output.result}'
 
-- 
GitLab


From 03a2d9285333014f2ebf401b03c00a4e89ecd05a Mon Sep 17 00:00:00 2001
From: "louise.deleger" <louise.deleger@inra.fr>
Date: Mon, 18 Oct 2021 12:15:35 +0200
Subject: [PATCH 2/2] Fix TSV conversion: strip line breaks and double quotes

---
 softwares/Florilege/scripts/preprocess-cirm-cfbp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/softwares/Florilege/scripts/preprocess-cirm-cfbp.py b/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
index c24e2f52..a7452277 100644
--- a/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
+++ b/softwares/Florilege/scripts/preprocess-cirm-cfbp.py
@@ -18,12 +18,13 @@ strain_index = int(args.strain_index)
 habitat_indexes = map(int,args.habitat_index.split(','))
 
 cirm_data = pd.read_excel(args.input, dtype='object')
+cirm_data.replace(to_replace=[r"\\n|\\r", "\n|\r"], value=[" "," "], regex=True, inplace=True)
+cirm_data.replace(to_replace=[r'["“”]'], value=[""], regex=True, inplace=True)
 
 taxa_dict = set()
 for i in range(len(cirm_data)) :
   taxon = str(cirm_data.iloc[i, taxa_index])
   strain = str(cirm_data.iloc[i, strain_index])
-  taxon = taxon.replace('"','')
   taxa_dict.add(taxon)
   taxa_dict.add(taxon + " CFBP " + strain)
   taxa_dict.add(taxon + " CFBP" + strain)
-- 
GitLab