From 70aa74719e828357d0ed763980c4fcbd2fea06f3 Mon Sep 17 00:00:00 2001
From: mandiayba <mouhamadou.ba@inrae.fr>
Date: Fri, 29 Oct 2021 15:27:31 +0200
Subject: [PATCH 1/2] solution to compare full.txt results from a update to
 another

---
 compare_results.snakefile | 52 +++++++++++++++++++++++++++++++++++++++
 config/config.yaml        |  8 ++++++
 2 files changed, 60 insertions(+)
 create mode 100644 compare_results.snakefile

diff --git a/compare_results.snakefile b/compare_results.snakefile
new file mode 100644
index 00000000..7e31c609
--- /dev/null
+++ b/compare_results.snakefile
@@ -0,0 +1,52 @@
+import os
+## config file
+configfile: "config/config.yaml"
+
+
+''' 
+all
+'''
+rule all:  # first rule of the file: requests the rank diff of the three entity types
+    input:
+        "corpora/florilege/compare/habitats.rankdiff.txt",
+        "corpora/florilege/compare/microorganisms.rankdiff.txt",
+        "corpora/florilege/compare/phenotypes.rankdiff.txt"
+		
+
+'''
+rank the old results
+'''
+rule rank_old_results:  # runs rank.py on the old {entity}.full.txt
+    input:  # file taken from the folder configured as OLD_RESULT_FOLDER
+        entities=os.path.join(config["OLD_RESULT_FOLDER"], "{entity}.full.txt")
+    output:  # stdout of rank.py; 6 and 7 are passed as COLUMN_ID and COLUMN_LABEL
+        rank="corpora/florilege/compare/old/{entity}.rank.txt"
+    shell:"""
+        python softwares/Florilege/scripts/rank.py 6 7 <{input.entities} >{output.rank}
+        """
+
+'''
+rank the new results
+'''
+rule rank_new_results:  # runs rank.py on the new {entity}.full.txt
+    input:  # file taken from the folder configured as NEW_RESULT_FOLDER
+        entities=os.path.join(config["NEW_RESULT_FOLDER"], "{entity}.full.txt")
+    output:  # stdout of rank.py; 6 and 7 are passed as COLUMN_ID and COLUMN_LABEL
+        rank="corpora/florilege/compare/new/{entity}.rank.txt"
+    shell:"""
+        python softwares/Florilege/scripts/rank.py 6 7 <{input.entities} >{output.rank}
+        """
+
+
+'''
+compare the old and new ranks
+'''
+rule compare:  # runs compare-ranks.py on the old and new rank files of one entity
+    input:  # rank files written by rank_old_results and rank_new_results
+        old="corpora/florilege/compare/old/{entity}.rank.txt",
+        new="corpora/florilege/compare/new/{entity}.rank.txt"
+    output:  # stdout of compare-ranks.py
+        diff="corpora/florilege/compare/{entity}.rankdiff.txt"
+    shell:"""
+        python softwares/Florilege/scripts/compare-ranks.py {input.old} {input.new} >{output.diff}
+        """
\ No newline at end of file
diff --git a/config/config.yaml b/config/config.yaml
index 8d9e07e7..c3b24433 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -76,3 +76,11 @@ PLAN_FOLDER : "plans/"
 SOFT_FOLDER : "softwares/"
 
 ANCI_FOLDER : "ancillaries/"
+
+## folders holding the full entity results to compare
+
+## put the old *.full.txt files in this folder
+OLD_RESULT_FOLDER: "corpora/florilege/compare/old"
+
+## put the new *.full.txt files in this folder
+NEW_RESULT_FOLDER: "corpora/florilege/compare/new"
\ No newline at end of file
-- 
GitLab


From 754a44214abd0238da86e53541a75e3a25115f70 Mon Sep 17 00:00:00 2001
From: mandiayba <mouhamadou.ba@inrae.fr>
Date: Fri, 29 Oct 2021 16:51:40 +0200
Subject: [PATCH 2/2] fix out of index error

---
 softwares/Florilege/scripts/rank.py | 53 +++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100755 softwares/Florilege/scripts/rank.py

diff --git a/softwares/Florilege/scripts/rank.py b/softwares/Florilege/scripts/rank.py
new file mode 100755
index 00000000..be60a007
--- /dev/null
+++ b/softwares/Florilege/scripts/rank.py
@@ -0,0 +1,53 @@
+# Reads a TSV file, counts occurrences of each identifier and ranks them by count
+#
+# Usage:
+#    python rank.py COLUMN_ID COLUMN_LABEL
+#
+# COLUMN_ID: column of identifiers (first is 0)
+# COLUMN_LABEL: column of the labels (first is 0)
+#
+# To count concepts, set COLUMN_ID to the concept identifier (ncbi:000000 or OBT:000000).
+# To count surface forms, set COLUMN_ID to the surface forms.
+# Set COLUMN_LABEL to the canonical form.
+#
+# Output is a TSV. Columns: rank, identifier, label, count.
+# Two items with the same count have the same rank.
+
+import sys
+import operator
+
+COLUMN_ID = int(sys.argv[1])
+COLUMN_LABEL = int(sys.argv[2])
+
+
+class Item:
+    def __init__(self, iid, label):
+        self.iid = iid
+        self.label = label
+        self.count = 1
+
+
+ITEMS = {}
+for line in sys.stdin:
+    cols = line.strip().split('\t')
+    if len(cols) <= COLUMN_ID:
+        continue
+    iid = cols[COLUMN_ID]
+    if iid in ITEMS:
+        ITEMS[iid].count += 1
+    else:
+        label = cols[COLUMN_LABEL]
+        ITEMS[iid] = Item(iid, label)
+
+RANKED = sorted(ITEMS.values(), reverse=True, key=operator.attrgetter('count'))
+
+prev_rank = 0
+prev_count = sys.maxsize
+for n, i in enumerate(RANKED):
+    if i.count == prev_count:
+        rank = prev_rank
+    else:
+        rank = prev_rank + 1
+        prev_rank = rank
+        prev_count = i.count
+    print('\t'.join((str(rank), i.iid, i.label, str(i.count))))
-- 
GitLab