lvwerra HF staff committed on
Commit
5f757f8
1 Parent(s): 90c19cd

Update Space (evaluate main: 0b7ed95a)

Browse files
Files changed (2) hide show
  1. README.md +11 -14
  2. rouge.py +10 -11
README.md CHANGED
@@ -38,12 +38,8 @@ At minimum, this metric takes as input a list of predictions and a list of refer
38
  >>> references = ["hello there", "general kenobi"]
39
  >>> results = rouge.compute(predictions=predictions,
40
  ... references=references)
41
- >>> print(list(results.keys()))
42
- ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
43
- >>> print(results["rouge1"])
44
- AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
45
- >>> print(results["rouge1"].mid.fmeasure)
46
- 1.0
47
  ```
48
 
49
  ### Inputs
@@ -62,18 +58,18 @@ AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(pre
62
  - **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
63
 
64
  ### Output Values
65
- The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of Score objects, with one score for each sentence. Each Score object includes the `precision`, `recall`, and `fmeasure`. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is:
66
 
67
  ```python
68
- {'rouge1': [Score(precision=1.0, recall=0.5, fmeasure=0.6666666666666666), Score(precision=1.0, recall=1.0, fmeasure=1.0)], 'rouge2': [Score(precision=0.0, recall=0.0, fmeasure=0.0), Score(precision=1.0, recall=1.0, fmeasure=1.0)]}
69
  ```
70
 
71
  If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
72
  ```python
73
- {'rouge1': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)), 'rouge2': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))}
74
  ```
75
 
76
- The `precision`, `recall`, and `fmeasure` values all have a range of 0 to 1.
77
 
78
 
79
  #### Values from Popular Papers
@@ -86,11 +82,12 @@ An example without aggregation:
86
  >>> predictions = ["hello goodbye", "ankh morpork"]
87
  >>> references = ["goodbye", "general kenobi"]
88
  >>> results = rouge.compute(predictions=predictions,
89
- ... references=references)
 
90
  >>> print(list(results.keys()))
91
  ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
92
  >>> print(results["rouge1"])
93
- [Score(precision=0.5, recall=0.5, fmeasure=0.5), Score(precision=0.0, recall=0.0, fmeasure=0.0)]
94
  ```
95
 
96
  The same example, but with aggregation:
@@ -104,7 +101,7 @@ The same example, but with aggregation:
104
  >>> print(list(results.keys()))
105
  ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
106
  >>> print(results["rouge1"])
107
- AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.25, recall=0.25, fmeasure=0.25), high=Score(precision=0.5, recall=0.5, fmeasure=0.5))
108
  ```
109
 
110
  The same example, but only calculating `rouge_1`:
@@ -119,7 +116,7 @@ The same example, but only calculating `rouge_1`:
119
  >>> print(list(results.keys()))
120
  ['rouge1']
121
  >>> print(results["rouge1"])
122
- AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.25, recall=0.25, fmeasure=0.25), high=Score(precision=0.5, recall=0.5, fmeasure=0.5))
123
  ```
124
 
125
  ## Limitations and Bias
 
38
  >>> references = ["hello there", "general kenobi"]
39
  >>> results = rouge.compute(predictions=predictions,
40
  ... references=references)
41
+ >>> print(results)
42
+ {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
 
 
 
 
43
  ```
44
 
45
  ### Inputs
 
58
  - **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
59
 
60
  ### Output Values
61
+ The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of scores, with one score for each sentence. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is:
62
 
63
  ```python
64
+ {'rouge1': [0.6666666666666666, 1.0], 'rouge2': [0.0, 1.0]}
65
  ```
66
 
67
  If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
68
  ```python
69
+ {'rouge1': 1.0, 'rouge2': 1.0}
70
  ```
71
 
72
+ The ROUGE values are in the range of 0 to 1.
73
 
74
 
75
  #### Values from Popular Papers
 
82
  >>> predictions = ["hello goodbye", "ankh morpork"]
83
  >>> references = ["goodbye", "general kenobi"]
84
  >>> results = rouge.compute(predictions=predictions,
85
+ ... references=references,
86
+ ... use_aggregator=False)
87
  >>> print(list(results.keys()))
88
  ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
89
  >>> print(results["rouge1"])
90
+ [0.5, 0.0]
91
  ```
92
 
93
  The same example, but with aggregation:
 
101
  >>> print(list(results.keys()))
102
  ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
103
  >>> print(results["rouge1"])
104
+ 0.25
105
  ```
106
 
107
  The same example, but only calculating `rouge_1`:
 
116
  >>> print(list(results.keys()))
117
  ['rouge1']
118
  >>> print(results["rouge1"])
119
+ 0.25
120
  ```
121
 
122
  ## Limitations and Bias
rouge.py CHANGED
@@ -65,22 +65,18 @@ Args:
65
  use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
66
  use_aggregator: Return aggregates if this is set to True
67
  Returns:
68
- rouge1: rouge_1 (precision, recall, f1),
69
- rouge2: rouge_2 (precision, recall, f1),
70
- rougeL: rouge_l (precision, recall, f1),
71
- rougeLsum: rouge_lsum (precision, recall, f1)
72
  Examples:
73
 
74
  >>> rouge = evaluate.load('rouge')
75
  >>> predictions = ["hello there", "general kenobi"]
76
  >>> references = ["hello there", "general kenobi"]
77
  >>> results = rouge.compute(predictions=predictions, references=references)
78
- >>> print(list(results.keys()))
79
- ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
80
- >>> print(results["rouge1"])
81
- AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
82
- >>> print(results["rouge1"].mid.fmeasure)
83
- 1.0
84
  """
85
 
86
 
@@ -123,9 +119,12 @@ class Rouge(evaluate.EvaluationModule):
123
 
124
  if use_aggregator:
125
  result = aggregator.aggregate()
 
 
 
126
  else:
127
  result = {}
128
  for key in scores[0]:
129
- result[key] = list(score[key] for score in scores)
130
 
131
  return result
 
65
  use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
66
  use_aggregator: Return aggregates if this is set to True
67
  Returns:
68
+ rouge1: rouge_1 (f1),
69
+ rouge2: rouge_2 (f1),
70
+ rougeL: rouge_l (f1),
71
+ rougeLsum: rouge_lsum (f1)
72
  Examples:
73
 
74
  >>> rouge = evaluate.load('rouge')
75
  >>> predictions = ["hello there", "general kenobi"]
76
  >>> references = ["hello there", "general kenobi"]
77
  >>> results = rouge.compute(predictions=predictions, references=references)
78
+ >>> print(results)
79
+ {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
 
 
 
 
80
  """
81
 
82
 
 
119
 
120
  if use_aggregator:
121
  result = aggregator.aggregate()
122
+ for key in result:
123
+ result[key] = result[key].mid.fmeasure
124
+
125
  else:
126
  result = {}
127
  for key in scores[0]:
128
+ result[key] = list(score[key].fmeasure for score in scores)
129
 
130
  return result