From 70aa74719e828357d0ed763980c4fcbd2fea06f3 Mon Sep 17 00:00:00 2001
From: mandiayba <mouhamadou.ba@inrae.fr>
Date: Fri, 29 Oct 2021 15:27:31 +0200
Subject: [PATCH 1/2] solution to compare full.txt results from a update to another

---
 compare_results.snakefile | 52 +++++++++++++++++++++++++++++++++++++++
 config/config.yaml        |  8 ++++++
 2 files changed, 60 insertions(+)
 create mode 100644 compare_results.snakefile

diff --git a/compare_results.snakefile b/compare_results.snakefile
new file mode 100644
index 00000000..7e31c609
--- /dev/null
+++ b/compare_results.snakefile
@@ -0,0 +1,52 @@
+import os
+## config file
+configfile: "config/config.yaml"
+
+
+'''
+all: default target, requests the rank-diff report of every entity type
+'''
+rule all:
+    input:
+        "corpora/florilege/compare/habitats.rankdiff.txt",
+        "corpora/florilege/compare/microorganisms.rankdiff.txt",
+        "corpora/florilege/compare/phenotypes.rankdiff.txt"
+
+
+'''
+rank the old {entity}.full.txt (rank.py: identifier column 6, label column 7)
+'''
+rule rank_old_results:
+    input:
+        entities=os.path.join(config["OLD_RESULT_FOLDER"], "{entity}.full.txt")
+    output:
+        rank="corpora/florilege/compare/old/{entity}.rank.txt"
+    shell:"""
+        python softwares/Florilege/scripts/rank.py 6 7 <{input.entities} >{output.rank}
+        """
+
+'''
+rank the new {entity}.full.txt (rank.py: identifier column 6, label column 7)
+'''
+rule rank_new_results:
+    input:
+        entities=os.path.join(config["NEW_RESULT_FOLDER"], "{entity}.full.txt")
+    output:
+        rank="corpora/florilege/compare/new/{entity}.rank.txt"
+    shell:"""
+        python softwares/Florilege/scripts/rank.py 6 7 <{input.entities} >{output.rank}
+        """
+
+
+'''
+compare the old and the new ranking of one entity type (compare-ranks.py)
+'''
+rule compare:
+    input:
+        old="corpora/florilege/compare/old/{entity}.rank.txt",
+        new="corpora/florilege/compare/new/{entity}.rank.txt"
+    output:
+        diff="corpora/florilege/compare/{entity}.rankdiff.txt"
+    shell:"""
+        python softwares/Florilege/scripts/compare-ranks.py {input.old} {input.new} >{output.diff}
+        """
\ No newline at end of file
diff --git a/config/config.yaml b/config/config.yaml
index 8d9e07e7..c3b24433 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -76,3 +76,11 @@
 PLAN_FOLDER : "plans/"
 SOFT_FOLDER : "softwares/"
 ANCI_FOLDER : "ancillaries/"
+
+## full entity results folder
+
+## put old *.full.txt in this folder
+OLD_RESULT_FOLDER: "corpora/florilege/compare/old"
+
+## put new *.full.txt in this folder
+NEW_RESULT_FOLDER: "corpora/florilege/compare/new"
\ No newline at end of file
-- 
GitLab


From 754a44214abd0238da86e53541a75e3a25115f70 Mon Sep 17 00:00:00 2001
From: mandiayba <mouhamadou.ba@inrae.fr>
Date: Fri, 29 Oct 2021 16:51:40 +0200
Subject: [PATCH 2/2] fix out of index error

---
 softwares/Florilege/scripts/rank.py | 53 +++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100755 softwares/Florilege/scripts/rank.py

diff --git a/softwares/Florilege/scripts/rank.py b/softwares/Florilege/scripts/rank.py
new file mode 100755
index 00000000..be60a007
--- /dev/null
+++ b/softwares/Florilege/scripts/rank.py
@@ -0,0 +1,53 @@
+# Reads a TSV file, counts occurrences of each identifier and ranks them by count
+#
+# Usage:
+#    python rank.py COLUMN_ID COLUMN_LABEL
+#
+# COLUMN_ID: column of identifiers (first is 0)
+# COLUMN_LABEL: column of the labels (first is 0)
+#
+# To count concepts, set COLUMN_ID to the concept identifier (ncbi:000000 or OBT:000000).
+# To count surface forms, set COLUMN_ID to the surface forms.
+# Set COLUMN_LABEL to the canonical form.
+#
+# Output is a TSV. Columns: rank, identifier, label, count.
+# Two items with the same count have the same rank.
import sys
import operator


class Item:
    """One distinct identifier with its label and its occurrence count.

    The label is the one read on the first line where the identifier was
    seen; the count starts at 1 for that first occurrence.
    """

    def __init__(self, iid, label):
        self.iid = iid
        self.label = label
        self.count = 1


def count_items(lines, column_id, column_label):
    """Count the occurrences of each identifier found in TSV lines.

    Args:
        lines: iterable of text lines (for instance an open file).
        column_id: index of the identifier column (first is 0).
        column_label: index of the label column (first is 0).

    Returns:
        A dict mapping each identifier to its Item, in order of first
        appearance. Lines too short to hold the identifier column are
        skipped. A line that holds the identifier but not the label column
        gets an empty label.
    """
    items = {}
    for line in lines:
        # Only trailing whitespace is removed: stripping leading tabs would
        # drop empty leading columns and shift every column index.
        cols = line.rstrip().split('\t')
        if len(cols) <= column_id:
            continue
        iid = cols[column_id]
        item = items.get(iid)
        if item is not None:
            item.count += 1
            continue
        # The label column may be absent (an empty trailing label is removed
        # together with the trailing whitespace); fall back to an empty label
        # rather than raising IndexError.
        label = cols[column_label] if column_label < len(cols) else ''
        items[iid] = Item(iid, label)
    return items


def rank_items(items):
    """Yield (rank, item) pairs, highest count first.

    Ranks start at 1. Items with the same count share the same rank, and the
    next distinct count gets the next integer (ranks have no gaps). The sort
    is stable, so items with equal counts keep their input order.
    """
    ranked = sorted(items, reverse=True, key=operator.attrgetter('count'))
    rank = 0
    prev_count = None
    for item in ranked:
        if item.count != prev_count:
            rank += 1
            prev_count = item.count
        yield rank, item


def main(argv=None, stdin=None, stdout=None):
    """Command-line entry point: python rank.py COLUMN_ID COLUMN_LABEL.

    Reads TSV lines from stdin and writes one TSV line per identifier to
    stdout, with columns rank, identifier, label, count.

    Args:
        argv: argument vector, defaults to sys.argv.
        stdin: input text stream, defaults to sys.stdin.
        stdout: output text stream, defaults to sys.stdout.

    Raises:
        SystemExit: when the two column arguments are missing.
        ValueError: when a column argument is not an integer.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit('Usage: python rank.py COLUMN_ID COLUMN_LABEL')
    column_id = int(argv[1])
    column_label = int(argv[2])
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    items = count_items(stdin, column_id, column_label)
    for rank, item in rank_items(items.values()):
        fields = (str(rank), item.iid, item.label, str(item.count))
        print('\t'.join(fields), file=stdout)


if __name__ == '__main__':
    main()