forked from cartsnitch/cartsnitch
Merge commit '4cf6f91e954b770198578bcb8db5d98ac964bfed' as 'common'
This commit is contained in:
@@ -0,0 +1,156 @@
|
||||
"""Product normalization — Phase 1: UPC matching + fuzzy name matching.
|
||||
|
||||
Matches products across retailers by:
|
||||
1. Exact UPC match (highest confidence)
|
||||
2. Fuzzy name matching via token-based Jaccard similarity (lower confidence)
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import StrEnum
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
|
||||
|
||||
class MatchMethod(StrEnum):
|
||||
"""How a product match was determined."""
|
||||
|
||||
UPC = "upc"
|
||||
NAME = "name"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MatchResult:
    """Immutable outcome of a successful product normalization lookup.

    Attributes:
        product: The normalized product that was matched.
        confidence: Match score; the UPC matcher in this module reports 1.0
            and the name matcher reports the Jaccard similarity it computed.
        method: Which strategy produced the match.
    """

    product: NormalizedProduct
    confidence: float
    method: MatchMethod
|
||||
|
||||
|
||||
# Low-signal tokens (articles, conjunctions, prepositions and marketing
# filler) that are dropped from product names during name cleaning.
_NOISE_WORDS = frozenset(
    "the a an and or of with in for to "
    "brand original classic new improved".split()
)
|
||||
|
||||
# Matches a quantity followed by a unit of measure, case-insensitively
# (e.g., "16 oz", "1.5 lb", "12 ct").
# Group 1 captures the number, group 2 captures the unit as written.
_SIZE_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*(oz|fl\s*oz|lb|lbs|g|kg|ml|l|ct|pk|count|pack)\b",
    flags=re.I,
)
|
||||
|
||||
|
||||
def clean_name(name: str) -> str:
    """Return a canonical, comparison-friendly form of a product name.

    Steps, applied in order:

    1. Lowercase the text.
    2. Drop size expressions (e.g., "16 oz").
    3. Replace every character that is neither a word character nor
       whitespace with a space.
    4. Discard noise words and rejoin the remaining tokens with single
       spaces.
    """
    lowered = name.lower()
    without_size = _SIZE_PATTERN.sub("", lowered)
    # Punctuation becomes a space so that "mac&cheese" splits into two tokens.
    depunctuated = re.sub(r"[^\w\s]", " ", without_size)
    return " ".join(
        word for word in depunctuated.split() if word not in _NOISE_WORDS
    )
|
||||
|
||||
|
||||
def extract_size_info(name: str) -> tuple[str, str] | None:
    """Extract (size, unit) from a product name, if present.

    Only the first size expression found in ``name`` is considered.

    Args:
        name: Raw product name, e.g. ``"Whole Milk 16 FL OZ"``.

    Returns:
        ``(size, unit)`` where ``size`` is the numeric text exactly as
        written and ``unit`` is lowercased with each internal whitespace run
        collapsed to a single underscore (``"FL  OZ"`` -> ``"fl_oz"``), or
        ``None`` when no size expression is found.
    """
    match = _SIZE_PATTERN.search(name)
    if match is None:
        return None
    size, unit = match.group(1), match.group(2)
    # The pattern accepts any whitespace run inside "fl oz" (tabs, double
    # spaces, ...). Replacing only literal single spaces produced keys such
    # as "fl__oz" or left a raw tab in the unit; split/join folds every run
    # into exactly one underscore so the same unit always yields the same key.
    # NOTE(review): "floz" (no whitespace) is still returned as "floz".
    return size, "_".join(unit.lower().split())
|
||||
|
||||
|
||||
def jaccard_similarity(a: str, b: str) -> float:
    """Score the token overlap of two cleaned names with the Jaccard index.

    Each string is split on whitespace into a set of unique tokens. The
    score is the number of shared tokens divided by the number of distinct
    tokens across both sets: 0.0 means nothing in common, 1.0 means identical
    token sets. If either string yields no tokens, the score is 0.0.
    """
    left, right = set(a.split()), set(b.split())
    if not (left and right):
        return 0.0
    shared = len(left & right)
    # Size of the union by inclusion-exclusion: |A| + |B| - |A & B|.
    return shared / (len(left) + len(right) - shared)
|
||||
|
||||
|
||||
def match_by_upc(session: Session, upc: str) -> MatchResult | None:
    """Look up the normalized product whose UPC variants include ``upc``.

    Every product with a non-null ``upc_variants`` value is fetched and the
    membership test runs in Python rather than in SQL, for cross-database
    compatibility (works on both PostgreSQL and SQLite). The first product
    containing the UPC wins and is reported with confidence 1.0; ``None`` is
    returned when no product lists it.
    """
    # TODO: Use PostgreSQL JSON containment query (@>) for production.
    # Current approach loads all products into memory — acceptable for tests
    # and small datasets, but will not scale.
    query = select(NormalizedProduct).where(NormalizedProduct.upc_variants.is_not(None))
    candidates = session.execute(query).scalars().all()
    hit = next(
        (
            candidate
            for candidate in candidates
            if candidate.upc_variants and upc in candidate.upc_variants
        ),
        None,
    )
    if hit is None:
        return None
    return MatchResult(product=hit, confidence=1.0, method=MatchMethod.UPC)
|
||||
|
||||
|
||||
def match_by_name(
    session: Session,
    name: str,
    threshold: float = 0.5,
) -> MatchResult | None:
    """Pick the normalized product whose name is most similar to ``name``.

    All normalized products are fetched, both names are passed through
    ``clean_name``, and each pair is scored with ``jaccard_similarity``.
    The highest-scoring product is returned provided its score is positive
    and at least ``threshold``; on ties the product seen first is kept.
    Returns ``None`` when nothing qualifies.
    """
    # TODO: Use pg_trgm similarity index for production.
    # Current approach loads all products into memory — acceptable for tests
    # and small datasets, but will not scale.
    target = clean_name(name)
    candidates = session.execute(select(NormalizedProduct)).scalars().all()

    winner: NormalizedProduct | None = None
    top_score = 0.0

    for candidate in candidates:
        similarity = jaccard_similarity(target, clean_name(candidate.canonical_name))
        # Skip anything below the threshold or not strictly better than the
        # current leader (strictness keeps the earliest product on ties).
        if not (similarity >= threshold and similarity > top_score):
            continue
        winner, top_score = candidate, similarity

    if not winner:
        return None
    return MatchResult(product=winner, confidence=top_score, method=MatchMethod.NAME)
|
||||
|
||||
|
||||
def normalize_product(
    session: Session,
    name: str,
    upc: str | None = None,
    name_threshold: float = 0.5,
) -> MatchResult | None:
    """Run the normalization pipeline: exact UPC lookup, then fuzzy name.

    The UPC lookup is attempted only when ``upc`` is non-empty. If it is
    skipped or finds nothing, the result of the name matcher (using
    ``name_threshold``) is returned, which may itself be ``None``.
    """
    upc_hit = match_by_upc(session, upc) if upc else None
    if upc_hit:
        return upc_hit
    # No usable UPC match: fall back to fuzzy name matching.
    return match_by_name(session, name, threshold=name_threshold)
|
||||
Reference in New Issue
Block a user