From c00ef48c50ae7d20244891fa212ecd707b369035 Mon Sep 17 00:00:00 2001
From: maru0804
Date: Tue, 13 Jan 2026 22:32:01 +0900
Subject: [PATCH 1/2] feat(evaluation): Add CJK tokenizer support for ROUGE-1
 evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #4122

The default ROUGE tokenizer only recognizes ASCII alphanumeric characters,
causing ROUGE-1 scores to be 0.0 for CJK (Chinese, Japanese, Korean) text.

Changes:
- Add CJKTokenizer class that handles CJK characters individually while
  preserving word-based tokenization for ASCII alphanumeric characters
- Add RougeScoreCriterion to allow explicit tokenizer specification
- Update RougeEvaluator to support custom tokenizers
- Add warning when CJK text is detected without CJK tokenizer
- Preserve backward compatibility: default behavior unchanged

Usage:

    criterion = RougeScoreCriterion(threshold=0.8, tokenizer='cjk')
    eval_metric = EvalMetric(
        metric_name='response_match_score',
        threshold=0.8,
        criterion=criterion,
    )

Limitations:
- Fullwidth alphanumeric (A-Z, 0-9) are skipped
- Greek, Cyrillic, and other non-CJK scripts are skipped
- This is character-based tokenization, not morphological analysis
---
 src/google/adk/dependencies/rouge_scorer.py    |   1 +
 src/google/adk/evaluation/eval_metrics.py      |  24 ++
 .../adk/evaluation/final_response_match_v1.py  | 187 +++++++++++++++-
 .../adk/evaluation/response_evaluator.py       |  11 +-
 .../test_final_response_match_v1.py            | 205 ++++++++++++++++++
 5 files changed, 418 insertions(+), 10 deletions(-)

diff --git a/src/google/adk/dependencies/rouge_scorer.py b/src/google/adk/dependencies/rouge_scorer.py
index cc987deb88..d9371ef46a 100644
--- a/src/google/adk/dependencies/rouge_scorer.py
+++ b/src/google/adk/dependencies/rouge_scorer.py
@@ -15,3 +15,4 @@
 from __future__ import annotations
 
 from rouge_score import rouge_scorer
+from rouge_score import tokenizers
diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 3047922c3f..11525d077d 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -251,6 +251,30 @@ class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion):
   )
 
+
+class RougeScoreCriterion(BaseCriterion):
+  """Criterion for ROUGE score evaluation with tokenizer options.
+
+  This criterion allows specifying a custom tokenizer for ROUGE-1
+  evaluation, particularly useful for CJK languages (Chinese, Japanese,
+  Korean) where the default tokenizer produces zero scores.
+
+  Note: The `threshold` field is inherited from BaseCriterion but is
+  IGNORED by RougeEvaluator. Always use EvalMetric.threshold instead.
+  """
+
+  tokenizer: Optional[str] = Field(
+      default=None,
+      description=(
+          "Tokenizer for text tokenization. Options:\n"
+          "- None: Default word-based tokenization (ASCII alphanumeric only).\n"
+          "  Non-ASCII text will produce score=0.0.\n"
+          "- 'cjk': Character-based tokenization for CJK (Chinese, Japanese,\n"
+          "  Korean) + ASCII alphanumeric. Other scripts (Greek, Cyrillic,\n"
+          "  fullwidth alphanumeric, etc.) are skipped. Stemming is disabled."
+      ),
+  )
+
+
 class EvalMetric(EvalBaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
 
diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index fb17fe80eb..6126301353 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,29 +14,166 @@
 
 from __future__ import annotations
 
+import logging
+import re
+from typing import ClassVar
+from typing import List
 from typing import Optional
 
 from google.genai import types as genai_types
+from pydantic import ValidationError
 from typing_extensions import override
 
 from ..dependencies.rouge_scorer import rouge_scorer
+from ..dependencies.rouge_scorer import tokenizers
 from .eval_case import ConversationScenario
 from .eval_case import Invocation
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
+from .eval_metrics import RougeScoreCriterion
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 
+logger = logging.getLogger("google_adk." + __name__)
+
+
+# =============================================================================
+# CJK Character Ranges
+# =============================================================================
+# Each range is defined separately for maintainability.
+# Order: Han (Chinese/Japanese/Korean) -> Japanese Kana -> Korean Hangul
+
+CJK_RANGES = (
+    "\u4e00-\u9fff"  # CJK Unified Ideographs (Han)
+    "\u3400-\u4dbf"  # CJK Extension A (Han)
+    "\u3040-\u309f"  # Hiragana (Japanese)
+    "\u30a0-\u30ff"  # Katakana (Japanese)
+    "\uac00-\ud7af"  # Hangul Syllables (Korean)
+)
+
+# CJK Symbols and Punctuation block (U+3000-U+303F)
+# Includes: 。、「」『』【】〈〉《》〔〕 etc.
+# Note: Fullwidth forms (U+FF00-U+FFEF) are NOT included here.
+CJK_PUNCTUATION = "\u3000-\u303f"
+
+CJK_CHAR_PATTERN = re.compile(f"[{CJK_RANGES}]")
+CJK_PUNCT_PATTERN = re.compile(f"[{CJK_PUNCTUATION}]")
+
+
+def _is_ascii_alnum(char: str) -> bool:
+  """Check if char is lowercase ASCII alphanumeric.
+
+  This function is designed to be called AFTER text.lower().
+  It only matches 'a'-'z' and '0'-'9', not 'A'-'Z'.
+
+  Args:
+    char: A single character (assumed to be lowercase).
+
+  Returns:
+    True if char is in 'a'-'z' or '0'-'9'.
+  """
+  return ("a" <= char <= "z") or ("0" <= char <= "9")
+
+
+def _contains_cjk(text: str) -> bool:
+  """Check if text contains any CJK characters."""
+  return bool(CJK_CHAR_PATTERN.search(text)) if text else False
+
+
+class CJKTokenizer(tokenizers.Tokenizer):
+  """Character-based tokenizer for CJK + ASCII alphanumeric mixed text.
+
+  This tokenizer is designed for evaluating text in CJK languages
+  (Chinese, Japanese, Korean) where the default ROUGE tokenizer fails
+  because it only recognizes ASCII alphanumeric characters.
+
+  Tokenization strategy:
+  - CJK characters: Each character becomes one token
+  - ASCII alphanumeric (a-z, 0-9): Word-based tokenization
+  - CJK punctuation/symbols (U+3000-U+303F): Removed
+  - All other characters: Skipped (not tokenized)
+
+  Limitations:
+  - Fullwidth alphanumeric (A-Z, 0-9): Skipped
+  - Greek, Cyrillic, accented Latin: Skipped
+  - This is NOT a general multilingual tokenizer
+
+  For morphological analysis, consider language-specific tokenizers
+  (e.g., MeCab for Japanese).
+
+  Note: Stemming is not applicable to CJK and is always disabled.
+  """
+
+  def tokenize(self, text: Optional[str]) -> List[str]:
+    """Tokenize text with CJK-aware segmentation.
+
+    Args:
+      text: Input text to tokenize. None or empty string returns [].
+
+    Returns:
+      List of tokens. CJK characters are individual tokens,
+      ASCII words are single tokens.
+    """
+    if not text:
+      return []
+
+    text = text.lower()
+    text = CJK_PUNCT_PATTERN.sub(" ", text)
+
+    tokens = []
+    i = 0
+    n = len(text)
+
+    while i < n:
+      char = text[i]
+
+      if CJK_CHAR_PATTERN.match(char):
+        tokens.append(char)
+        i += 1
+      elif _is_ascii_alnum(char):
+        word_start = i
+        while i < n and _is_ascii_alnum(text[i]):
+          i += 1
+        tokens.append(text[word_start:i])
+      else:
+        i += 1
+
+    return tokens
 
 
 class RougeEvaluator(Evaluator):
-  """Evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric.
+  """Evaluates using Rouge_1 metric with optional CJK support.
 
   Value range for this metric is [0,1], with values closer to 1 more desirable.
+
+  Warning behavior:
+    When CJK characters are detected but no tokenizer is specified,
+    a warning is logged. This warning is logged at most ONCE per
+    RougeEvaluator instance, even if evaluate_invocations() is called
+    multiple times.
   """
 
+  criterion_type: ClassVar[type[BaseCriterion]] = RougeScoreCriterion
+
   def __init__(self, eval_metric: EvalMetric):
     self._eval_metric = eval_metric
+    self._tokenizer: Optional[tokenizers.Tokenizer] = None
+    self._use_stemmer = True
+    # Warning is logged at most once per instance
+    self._warned_about_cjk = False
+
+    if eval_metric.criterion:
+      try:
+        criterion = RougeScoreCriterion.model_validate(
+            eval_metric.criterion.model_dump()
+        )
+        if criterion.tokenizer == "cjk":
+          self._tokenizer = CJKTokenizer()
+          self._use_stemmer = False  # Stemming not applicable to CJK
+      except ValidationError:
+        pass  # Different criterion type, ignore
 
   @override
   def evaluate_invocations(
@@ -55,7 +192,16 @@ def evaluate_invocations(
     for actual, expected in zip(actual_invocations, expected_invocations):
       reference = _get_text_from_content(expected.final_response)
       response = _get_text_from_content(actual.final_response)
-      rouge_1_scores = _calculate_rouge_1_scores(response, reference)
+
+      # Log warning once if CJK detected without tokenizer
+      self._maybe_warn_cjk(reference, response)
+
+      rouge_1_scores = _calculate_rouge_1_scores(
+          response,
+          reference,
+          tokenizer=self._tokenizer,
+          use_stemmer=self._use_stemmer,
+      )
       score = rouge_1_scores.fmeasure
       per_invocation_results.append(
           PerInvocationResult(
@@ -80,6 +226,21 @@ def evaluate_invocations(
 
     return EvaluationResult()
 
+  def _maybe_warn_cjk(self, reference: str, response: str) -> None:
+    """Log warning if CJK detected without tokenizer (once per instance)."""
+    if self._warned_about_cjk:
+      return
+    if self._tokenizer is not None:
+      return
+    if _contains_cjk(reference) or _contains_cjk(response):
+      logger.warning(
+          "CJK characters detected in text but no tokenizer specified. "
+          "ROUGE scores will likely be 0.0 for CJK text. "
+          "Consider using RougeScoreCriterion(tokenizer='cjk') for "
+          "Chinese, Japanese, or Korean language support."
+      )
+      self._warned_about_cjk = True
 
 
 def _get_text_from_content(content: Optional[genai_types.Content]) -> str:
   if content and content.parts:
@@ -92,25 +253,37 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
-def _calculate_rouge_1_scores(candidate: str, reference: str):
+def _calculate_rouge_1_scores(
+    candidate: str,
+    reference: str,
+    tokenizer: Optional[tokenizers.Tokenizer] = None,
+    use_stemmer: bool = True,
+):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
   ROUGE-1 measures the overlap of unigrams (single words) between the candidate
   and reference texts. The score is broken down into:
 
   - Precision: The proportion of unigrams in the candidate that are also in the
-  reference.
+    reference.
   - Recall: The proportion of unigrams in the reference that are also in the
-  candidate.
+    candidate.
   - F-measure: The harmonic mean of precision and recall.
 
   Args:
     candidate: The generated text to be evaluated.
     reference: The ground-truth text to compare against.
+    tokenizer: Custom tokenizer (e.g., CJKTokenizer). None for default.
+    use_stemmer: Whether to use Porter stemmer. Ignored if tokenizer is set.
 
   Returns:
-    A dictionary containing the ROUGE-1 precision, recall, and f-measure.
+    A Score object containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  if tokenizer:
+    scorer = rouge_scorer.RougeScorer(
+        ["rouge1"], use_stemmer=False, tokenizer=tokenizer
+    )
+  else:
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.
diff --git a/src/google/adk/evaluation/response_evaluator.py b/src/google/adk/evaluation/response_evaluator.py
index 3fa3754913..c54618f51d 100644
--- a/src/google/adk/evaluation/response_evaluator.py
+++ b/src/google/adk/evaluation/response_evaluator.py
@@ -58,6 +58,8 @@ def __init__(
           " metric_name should be specified."
       )
 
+    self._eval_metric = eval_metric
+
     if eval_metric:
       threshold = eval_metric.threshold
       metric_name = eval_metric.metric_name
@@ -82,9 +84,12 @@ def evaluate_invocations(
   ) -> EvaluationResult:
     # If the metric is response_match_score, just use the RougeEvaluator.
     if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
-      rouge_evaluator = RougeEvaluator(
-          EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
-      )
+      if self._eval_metric:
+        rouge_evaluator = RougeEvaluator(eval_metric=self._eval_metric)
+      else:
+        rouge_evaluator = RougeEvaluator(
+            EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
+        )
       return rouge_evaluator.evaluate_invocations(
           actual_invocations, expected_invocations, conversation_scenario
       )
diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py
index eef35d86d6..7bc8b923bc 100644
--- a/tests/unittests/evaluation/test_final_response_match_v1.py
+++ b/tests/unittests/evaluation/test_final_response_match_v1.py
@@ -139,3 +139,208 @@ def test_rouge_evaluator_multiple_invocations(
       expected_score, rel=1e-3
   )
   assert evaluation_result.overall_eval_status == expected_status
+
+
+# =============================================================================
+# CJK Tokenizer Tests (Issue #4122)
+# =============================================================================
+
+import logging
+
+from google.adk.evaluation.eval_metrics import RougeScoreCriterion
+from google.adk.evaluation.final_response_match_v1 import _contains_cjk
+from google.adk.evaluation.final_response_match_v1 import CJKTokenizer
+
+
+class TestCJKTokenizer:
+  """Tests for CJKTokenizer tokenization behavior."""
+
+  def test_tokenize_japanese(self):
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("これはテスト")
+    assert tokens == ["こ", "れ", "は", "テ", "ス", "ト"]
+
+  def test_tokenize_english(self):
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("This is a test")
+    assert tokens == ["this", "is", "a", "test"]
+
+  def test_tokenize_mixed_cjk_and_ascii(self):
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("Hello世界World")
+    assert tokens == ["hello", "世", "界", "world"]
+
+  def test_tokenize_fullwidth_alphanumeric_skipped(self):
+    """Fullwidth alphanumeric should be skipped."""
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("ＡＢＣ１２３")
+    assert tokens == []
+
+  def test_tokenize_greek_skipped(self):
+    """Greek and other non-CJK scripts should be skipped."""
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("αβγtest")
+    assert tokens == ["test"]
+
+  def test_tokenize_empty_string(self):
+    tokenizer = CJKTokenizer()
+    assert tokenizer.tokenize("") == []
+
+  def test_tokenize_none(self):
+    """None input should return empty list."""
+    tokenizer = CJKTokenizer()
+    assert tokenizer.tokenize(None) == []
+
+  def test_tokenize_chinese(self):
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("这是测试")
+    assert tokens == ["这", "是", "测", "试"]
+
+  def test_tokenize_korean(self):
+    tokenizer = CJKTokenizer()
+    tokens = tokenizer.tokenize("테스트")
+    assert len(tokens) == 3  # 3 Hangul syllables
+
+
+class TestContainsCJK:
+  """Tests for _contains_cjk helper function."""
+
+  def test_contains_cjk_japanese(self):
+    assert _contains_cjk("これはテスト") is True
+
+  def test_contains_cjk_english(self):
+    assert _contains_cjk("This is a test") is False
+
+  def test_contains_cjk_mixed(self):
+    assert _contains_cjk("Hello世界") is True
+
+  def test_contains_cjk_empty(self):
+    assert _contains_cjk("") is False
+
+  def test_contains_cjk_none(self):
+    assert _contains_cjk(None) is False
+
+
+class TestRougeScoreWithCJKTokenizer:
+  """Tests for ROUGE score calculation with CJK tokenizer."""
+
+  def test_english_identical_default_tokenizer(self):
+    """English identical text should score 1.0 with default tokenizer."""
+    result = self._evaluate("This is a test", "This is a test", None)
+    assert result.overall_score == pytest.approx(1.0)
+
+  def test_english_partial_default_tokenizer(self):
+    """English partial match should score between 0 and 1."""
+    result = self._evaluate("This is test", "This is a test", None)
+    assert 0 < result.overall_score < 1
+
+  def test_japanese_without_tokenizer_scores_zero(self):
+    """Japanese text without CJK tokenizer should score 0.0."""
+    result = self._evaluate("これはテスト", "これはテスト", None)
+    assert result.overall_score == pytest.approx(0.0)
+
+  def test_japanese_identical_with_cjk_tokenizer(self):
+    """Japanese identical text with CJK tokenizer should score 1.0."""
+    result = self._evaluate("これはテスト", "これはテスト", "cjk")
+    assert result.overall_score == pytest.approx(1.0)
+
+  def test_japanese_partial_with_cjk_tokenizer(self):
+    """Japanese partial match should score between 0 and 1."""
+    result = self._evaluate("これはテスト", "これはサンプル", "cjk")
+    assert 0 < result.overall_score < 1
+
+  def test_chinese_identical_with_cjk_tokenizer(self):
+    """Chinese identical text with CJK tokenizer should score 1.0."""
+    result = self._evaluate("这是测试", "这是测试", "cjk")
+    assert result.overall_score == pytest.approx(1.0)
+
+  def test_mixed_text_identical_with_cjk_tokenizer(self):
+    """Mixed CJK+ASCII identical text should score 1.0."""
+    result = self._evaluate("Hello世界", "Hello世界", "cjk")
+    assert result.overall_score == pytest.approx(1.0)
+
+  def test_cjk_punctuation_does_not_affect_score(self):
+    """CJK punctuation should be removed, not affecting score."""
+    result_with = self._evaluate("これはテスト。", "これはテスト", "cjk")
+    result_without = self._evaluate("これはテスト", "これはテスト", "cjk")
+    assert result_with.overall_score == pytest.approx(1.0)
+    assert result_without.overall_score == pytest.approx(1.0)
+
+  def _evaluate(self, candidate: str, reference: str, tokenizer_type: str):
+    """Helper to evaluate ROUGE score."""
+    criterion = None
+    if tokenizer_type:
+      criterion = RougeScoreCriterion(threshold=0.8, tokenizer=tokenizer_type)
+
+    eval_metric = EvalMetric(
+        metric_name="response_match_score",
+        threshold=0.8,
+        criterion=criterion,
+    )
+    evaluator = RougeEvaluator(eval_metric=eval_metric)
+
+    actual, expected = _create_test_invocations(candidate, reference)
+
+    return evaluator.evaluate_invocations([actual], [expected])
+
+
+class TestCJKWarning:
+  """Tests for CJK detection warning behavior."""
+
+  def test_warning_logged_once_for_multiple_evaluations(self, caplog):
+    """Warning should be logged exactly once per evaluator instance."""
+    eval_metric = EvalMetric(
+        metric_name="response_match_score",
+        threshold=0.8,
+    )
+    evaluator = RougeEvaluator(eval_metric=eval_metric)
+
+    actual1, expected1 = _create_test_invocations(
+        "これはテスト", "これはテスト"
+    )
+    actual2, expected2 = _create_test_invocations("別のテスト", "別のテスト")
+
+    with caplog.at_level(logging.WARNING):
+      # First evaluation with CJK - should trigger warning
+      evaluator.evaluate_invocations([actual1], [expected1])
+      # Second evaluation with CJK - should NOT trigger warning
+      evaluator.evaluate_invocations([actual2], [expected2])
+
+    cjk_warnings = [r for r in caplog.records if "CJK" in r.message]
+    assert len(cjk_warnings) == 1
+
+  def test_no_warning_when_cjk_tokenizer_specified(self, caplog):
+    """No warning when CJK tokenizer is properly specified."""
+    criterion = RougeScoreCriterion(threshold=0.8, tokenizer="cjk")
+    eval_metric = EvalMetric(
+        metric_name="response_match_score",
+        threshold=0.8,
+        criterion=criterion,
+    )
+    evaluator = RougeEvaluator(eval_metric=eval_metric)
+
+    actual, expected = _create_test_invocations("これはテスト", "これはテスト")
+
+    with caplog.at_level(logging.WARNING):
+      evaluator.evaluate_invocations([actual], [expected])
+
+    cjk_warnings = [r for r in caplog.records if "CJK" in r.message]
+    assert len(cjk_warnings) == 0
+
+  def test_no_warning_for_english_text(self, caplog):
+    """No warning for ASCII-only text."""
+    eval_metric = EvalMetric(
+        metric_name="response_match_score",
+        threshold=0.8,
+    )
+    evaluator = RougeEvaluator(eval_metric=eval_metric)
+
+    actual, expected = _create_test_invocations(
+        "This is a test", "This is a test"
+    )
+
+    with caplog.at_level(logging.WARNING):
+      evaluator.evaluate_invocations([actual], [expected])
+
+    cjk_warnings = [r for r in caplog.records if "CJK" in r.message]
+    assert len(cjk_warnings) == 0

From a5bae8d2a68d22f2848a5272f7455b6d90ae7e7b Mon Sep 17 00:00:00 2001
From: maru0804
Date: Tue, 13 Jan 2026 22:42:05 +0900
Subject: [PATCH 2/2] refactor(evaluation): Simplify CJKTokenizer and optimize
 RougeScorer instantiation

Address Gemini Code Assist review feedback:

1. Simplify CJKTokenizer.tokenize() using re.findall instead of manual loop
   - More concise and idiomatic Python
   - Same functionality with fewer lines of code

2. Optimize RougeScorer instantiation for performance
   - Create RougeScorer once in __init__ instead of per invocation
   - Reuse self._scorer across all evaluate_invocations calls
   - Avoids unnecessary object creation in loops
---
 .../adk/evaluation/final_response_match_v1.py | 69 +++++++------------
 1 file changed, 24 insertions(+), 45 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 6126301353..fbeae957fa 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -62,19 +62,8 @@
 CJK_PUNCT_PATTERN = re.compile(f"[{CJK_PUNCTUATION}]")
 
 
-def _is_ascii_alnum(char: str) -> bool:
-  """Check if char is lowercase ASCII alphanumeric.
-
-  This function is designed to be called AFTER text.lower().
-  It only matches 'a'-'z' and '0'-'9', not 'A'-'Z'.
-
-  Args:
-    char: A single character (assumed to be lowercase).
-
-  Returns:
-    True if char is in 'a'-'z' or '0'-'9'.
-  """
-  return ("a" <= char <= "z") or ("0" <= char <= "9")
+# Regex pattern for tokenization: matches CJK characters or ASCII alphanumeric words
+_CJK_TOKEN_PATTERN = re.compile(f"[{CJK_RANGES}]|[a-z0-9]+")
 
 
 def _contains_cjk(text: str) -> bool:
@@ -121,26 +110,7 @@ def tokenize(self, text: Optional[str]) -> List[str]:
 
     text = text.lower()
     text = CJK_PUNCT_PATTERN.sub(" ", text)
-
-    tokens = []
-    i = 0
-    n = len(text)
-
-    while i < n:
-      char = text[i]
-
-      if CJK_CHAR_PATTERN.match(char):
-        tokens.append(char)
-        i += 1
-      elif _is_ascii_alnum(char):
-        word_start = i
-        while i < n and _is_ascii_alnum(text[i]):
-          i += 1
-        tokens.append(text[word_start:i])
-      else:
-        i += 1
-
-    return tokens
+    return _CJK_TOKEN_PATTERN.findall(text)
 
 
 class RougeEvaluator(Evaluator):
@@ -159,22 +129,35 @@ class RougeEvaluator(Evaluator):
 
   def __init__(self, eval_metric: EvalMetric):
     self._eval_metric = eval_metric
-    self._tokenizer: Optional[tokenizers.Tokenizer] = None
-    self._use_stemmer = True
     # Warning is logged at most once per instance
     self._warned_about_cjk = False
 
+    tokenizer: Optional[tokenizers.Tokenizer] = None
+    use_stemmer = True
+
     if eval_metric.criterion:
       try:
         criterion = RougeScoreCriterion.model_validate(
             eval_metric.criterion.model_dump()
         )
         if criterion.tokenizer == "cjk":
-          self._tokenizer = CJKTokenizer()
-          self._use_stemmer = False  # Stemming not applicable to CJK
+          tokenizer = CJKTokenizer()
+          use_stemmer = False  # Stemming not applicable to CJK
       except ValidationError:
         pass  # Different criterion type, ignore
 
+    # Create scorer once for reuse across invocations (performance optimization)
+    if tokenizer:
+      self._scorer = rouge_scorer.RougeScorer(
+          ["rouge1"], use_stemmer=False, tokenizer=tokenizer
+      )
+      self._has_cjk_tokenizer = True
+    else:
+      self._scorer = rouge_scorer.RougeScorer(
+          ["rouge1"], use_stemmer=use_stemmer
+      )
+      self._has_cjk_tokenizer = False
+
   @override
   def evaluate_invocations(
       self,
@@ -196,13 +179,9 @@ def evaluate_invocations(
       # Log warning once if CJK detected without tokenizer
       self._maybe_warn_cjk(reference, response)
 
-      rouge_1_scores = _calculate_rouge_1_scores(
-          response,
-          reference,
-          tokenizer=self._tokenizer,
-          use_stemmer=self._use_stemmer,
-      )
-      score = rouge_1_scores.fmeasure
+      # Use pre-created scorer for performance
+      scores = self._scorer.score(reference, response)
+      score = scores["rouge1"].fmeasure
       per_invocation_results.append(
           PerInvocationResult(
               actual_invocation=actual,
@@ -230,7 +209,7 @@ def _maybe_warn_cjk(self, reference: str, response: str) -> None:
     """Log warning if CJK detected without tokenizer (once per instance)."""
     if self._warned_about_cjk:
       return
-    if self._tokenizer is not None:
+    if self._has_cjk_tokenizer:
       return
     if _contains_cjk(reference) or _contains_cjk(response):
       logger.warning(
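
Below is a minimal, self-contained sketch of the behavior this series targets, for readers who want to
reproduce it outside ADK. It assumes only the rouge-score package is installed; DemoCJKTokenizer and the
surrounding names are illustrative stand-ins that mirror the patch's CJKTokenizer regex, not part of the
patch itself.

    import re

    from rouge_score import rouge_scorer
    from rouge_score import tokenizers

    # Same character ranges the patch uses: Han, Hiragana, Katakana, Hangul.
    CJK_RANGES = (
        "\u4e00-\u9fff"
        "\u3400-\u4dbf"
        "\u3040-\u309f"
        "\u30a0-\u30ff"
        "\uac00-\ud7af"
    )
    TOKEN_PATTERN = re.compile(f"[{CJK_RANGES}]|[a-z0-9]+")


    class DemoCJKTokenizer(tokenizers.Tokenizer):
      """One token per CJK character, word tokens for ASCII alphanumerics."""

      def tokenize(self, text):
        return TOKEN_PATTERN.findall(text.lower()) if text else []


    reference = "これはテスト"
    candidate = "これはテスト"

    # The default tokenizer keeps only ASCII alphanumerics, so identical CJK
    # strings still score 0.0.
    default_scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
    print(default_scorer.score(reference, candidate)["rouge1"].fmeasure)  # 0.0

    # Character-based tokenization restores a meaningful unigram overlap.
    cjk_scorer = rouge_scorer.RougeScorer(
        ["rouge1"], use_stemmer=False, tokenizer=DemoCJKTokenizer()
    )
    print(cjk_scorer.score(reference, candidate)["rouge1"].fmeasure)  # 1.0

Stemming stays off in the CJK path because the Porter stemmer only operates on Latin-script words, which
matches the choice made in both patches.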