hynky (HF staff) committed
Commit 276d919
1 Parent(s): 638184c

⚡️ make it faster

src/logic/data_fetching.py CHANGED

@@ -6,7 +6,7 @@ import tempfile
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict
-from datatrove.io import get_datafolder
+from datatrove.io import get_datafolder, _get_true_fs
 from datatrove.utils.stats import MetricStatsDict
 import gradio as gr
 import tenacity
@@ -17,11 +17,17 @@ def find_folders(base_folder: str, path: str) -> List[str]:
     base_folder_df = get_datafolder(base_folder)
     if not base_folder_df.exists(path):
         return []
-    return [
+
+    from huggingface_hub import HfFileSystem
+    extra_options = {}
+    if isinstance(_get_true_fs(base_folder_df.fs), HfFileSystem):
+        extra_options["expand_info"] = False  # speed up
+
+    return (
         folder
-        for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
+        for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True, **extra_options).items()
         if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
-    ]
+    )
 
 def fetch_datasets(base_folder: str, progress=gr.Progress()):
     datasets = sorted(progress.tqdm(find_folders(base_folder, "")))
@@ -111,7 +117,7 @@ def fetch_graph_data(
     progress=gr.Progress(),
 ):
     if len(datasets) <= 0 or not metric_name or not grouping:
-        return None
+        return None, None
 
     with ThreadPoolExecutor() as pool:
         data = list(
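Two notes on this file. The early exit in fetch_graph_data now returns a pair, presumably to match a two-value return on the success path (which lies outside this hunk), so callers that unpack the result no longer break on empty input. In find_folders, _get_true_fs apparently unwraps any wrapper filesystem so the isinstance check sees the real backend; when that backend is the Hub, passing expand_info=False to HfFileSystem.find skips resolving per-entry metadata such as size and last commit, which is what makes shallow listings slow. A minimal standalone sketch of that pattern, using a hypothetical repo path:

from huggingface_hub import HfFileSystem

fs = HfFileSystem()
# detail=True returns {path: info}; expand_info=False keeps the listing cheap
# by not fetching size/last-commit information for every entry.
entries = fs.find(
    "datasets/some-org/some-repo",  # hypothetical path, for illustration only
    maxdepth=1,
    withdirs=True,
    detail=True,
    expand_info=False,
)
folders = [path for path, info in entries.items() if info["type"] == "directory"]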
src/logic/data_processing.py CHANGED

@@ -1,4 +1,5 @@
 from datetime import datetime
+import numpy as np
 import json
 import re
 import heapq
@@ -13,30 +14,43 @@ from src.logic.graph_settings import Grouping
 PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
 
 def prepare_for_non_grouped_plotting(metric: Dict[str, MetricStatsDict], normalization: bool, rounding: int) -> Dict[float, float]:
-    metrics_rounded = defaultdict(lambda: 0)
-    for key, value in metric.items():
-        metrics_rounded[round(float(key), rounding)] += value.total
+    keys = np.array([float(key) for key in metric.keys()])
+    values = np.array([value.total for value in metric.values()])
+
+    rounded_keys = np.round(keys, rounding)
+    unique_keys, indices = np.unique(rounded_keys, return_inverse=True)
+    metrics_rounded = np.zeros_like(unique_keys, dtype=float)
+    np.add.at(metrics_rounded, indices, values)
+
     if normalization:
-        normalizer = sum(metrics_rounded.values())
-        metrics_rounded = {k: v / normalizer for k, v in metrics_rounded.items()}
-        assert abs(sum(metrics_rounded.values()) - 1) < 0.01
-    return metrics_rounded
+        normalizer = np.sum(metrics_rounded)
+        metrics_rounded /= normalizer
+
+    return dict(zip(unique_keys, metrics_rounded))
 
 def prepare_for_group_plotting(metric: Dict[str, MetricStatsDict], top_k: int, direction: PARTITION_OPTIONS, regex: str | None, rounding: int) -> Tuple[List[str], List[float], List[float]]:
     regex_compiled = re.compile(regex) if regex else None
-    metric = {key: value for key, value in metric.items() if not regex or regex_compiled.match(key)}
-    means = {key: round(float(value.mean), rounding) for key, value in metric.items()}
+    filtered_metric = {key: value for key, value in metric.items() if not regex or regex_compiled.match(key)}
+
+    keys = np.array(list(filtered_metric.keys()))
+    means = np.array([float(value.mean) for value in filtered_metric.values()])
+    stds = np.array([value.standard_deviation for value in filtered_metric.values()])
+
+    rounded_means = np.round(means, rounding)
+
     if direction == "Top":
-        keys = heapq.nlargest(top_k, means, key=means.get)
+        top_indices = np.argsort(rounded_means)[-top_k:][::-1]
     elif direction == "Most frequent (n_docs)":
-        totals = {key: int(value.n) for key, value in metric.items()}
-        keys = heapq.nlargest(top_k, totals, key=totals.get)
+        totals = np.array([int(value.n) for value in filtered_metric.values()])
+        top_indices = np.argsort(totals)[-top_k:][::-1]
     else:
-        keys = heapq.nsmallest(top_k, means, key=means.get)
-
-    means = [means[key] for key in keys]
-    stds = [metric[key].standard_deviation for key in keys]
-    return keys, means, stds
+        top_indices = np.argsort(rounded_means)[:top_k]
+
+    top_keys = keys[top_indices]
+    top_means = rounded_means[top_indices]
+    top_stds = stds[top_indices]
+
+    return top_keys.tolist(), top_means.tolist(), top_stds.tolist()
 
 def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str, grouping: Grouping):
     if not exported_data:
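For reference, a standalone sketch (with made-up inputs) of the two NumPy idioms the rewrite relies on: np.unique(..., return_inverse=True) with np.add.at sums values per rounded key in one vectorized pass, replacing the defaultdict loop, and argsort slicing picks the k largest or smallest entries, replacing heapq.nlargest/nsmallest:

import numpy as np

# Bucketing: sum values per rounded key.
keys = np.array([0.123, 0.1234, 0.341])
values = np.array([10.0, 5.0, 2.0])
rounded = np.round(keys, 2)                    # [0.12, 0.12, 0.34]
unique_keys, inv = np.unique(rounded, return_inverse=True)
sums = np.zeros_like(unique_keys)              # float buckets, one per unique key
np.add.at(sums, inv, values)                   # unbuffered scatter-add
print(dict(zip(unique_keys.tolist(), sums.tolist())))  # {0.12: 15.0, 0.34: 2.0}

# Top-k selection: argsort slicing.
means = np.array([3.1, 0.2, 5.7, 4.4])
top_k = 2
largest = np.argsort(means)[-top_k:][::-1]     # indices [2, 3] -> values [5.7, 4.4]
smallest = np.argsort(means)[:top_k]           # indices [1, 0] -> values [0.2, 3.1]

np.add.at is used rather than plain fancy-indexed assignment (sums[inv] += values) because it accumulates correctly when the same bucket index occurs more than once.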
src/logic/plotting.py CHANGED

@@ -11,7 +11,7 @@ from src.logic.utils import set_alpha
 from datatrove.utils.stats import MetricStatsDict
 
 def plot_scatter(
-    data: Dict[str, Dict[float, float]],
+    data: Dict[str, MetricStatsDict],
     metric_name: str,
     log_scale_x: bool,
     log_scale_y: bool,
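The annotation change suggests plot_scatter now receives each dataset's raw MetricStatsDict and does its own binning (presumably via prepare_for_non_grouped_plotting) instead of being handed pre-aggregated {value: frequency} dicts. The function body is outside this hunk, so the sketch below is an inferred call pattern, not part of the diff:

# Inferred, illustrative only: bin each dataset's raw stats inside plot_scatter.
for dataset_name, stats in data.items():
    histogram = prepare_for_non_grouped_plotting(stats, normalization, rounding)
    # ... plot sorted x values against their (possibly normalized) totals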