f47da487da
Extract receiptwitness/ from the monorepo into cartsnitch/receiptwitness. Inline the consumed modules from cartsnitch-common so there is no cross-repo dependency. - Add src/receiptwitness/shared/ with inlined models, schemas, constants, database - Update all imports from cartsnitch_common to receiptwitness.shared - Remove cartsnitch-common dependency from pyproject.toml - Copy and update Alembic config (alembic.ini, alembic/) - Update Dockerfile for standalone build context, add migration CMD - Add CI workflow with lint, test, build, grype scan, deploy-dev, deploy-uat - Add .grype.yaml Co-Authored-By: Paperclip <noreply@paperclip.ing>
159 lines
5.0 KiB
Python
159 lines
5.0 KiB
Python
"""Tests for product normalization module."""
|
|
|
|
import uuid
|
|
from datetime import UTC, datetime
|
|
|
|
from receiptwitness.shared.models import NormalizedProduct
|
|
|
|
from receiptwitness.pipeline.normalization import (
|
|
MatchMethod,
|
|
clean_name,
|
|
extract_size_info,
|
|
jaccard_similarity,
|
|
match_by_name,
|
|
match_by_upc,
|
|
normalize_product,
|
|
)
|
|
|
|
|
|
class TestCleanName:
|
|
def test_lowercase(self):
|
|
assert clean_name("Kroger WHOLE MILK") == "kroger whole milk"
|
|
|
|
def test_removes_size_info(self):
|
|
assert "oz" not in clean_name("Milk 16 oz Whole")
|
|
|
|
def test_removes_noise_words(self):
|
|
cleaned = clean_name("The Original Brand Milk")
|
|
assert "the" not in cleaned.split()
|
|
assert "original" not in cleaned.split()
|
|
assert "brand" not in cleaned.split()
|
|
|
|
def test_collapses_whitespace(self):
|
|
assert " " not in clean_name("Milk Whole Gallon")
|
|
|
|
def test_removes_punctuation(self):
|
|
cleaned = clean_name("Meijer's Best (Organic) Milk!")
|
|
assert "'" not in cleaned
|
|
assert "(" not in cleaned
|
|
|
|
|
|
class TestExtractSizeInfo:
|
|
def test_extracts_oz(self):
|
|
result = extract_size_info("Cereal 18 oz box")
|
|
assert result == ("18", "oz")
|
|
|
|
def test_extracts_fl_oz(self):
|
|
result = extract_size_info("Juice 64 fl oz")
|
|
assert result == ("64", "fl_oz")
|
|
|
|
def test_extracts_lb(self):
|
|
result = extract_size_info("Ground Beef 1.5 lb")
|
|
assert result == ("1.5", "lb")
|
|
|
|
def test_extracts_ct(self):
|
|
result = extract_size_info("Eggs Large 12 ct")
|
|
assert result == ("12", "ct")
|
|
|
|
def test_no_size_returns_none(self):
|
|
assert extract_size_info("Bananas") is None
|
|
|
|
|
|
class TestJaccardSimilarity:
|
|
def test_identical_strings(self):
|
|
assert jaccard_similarity("whole milk gallon", "whole milk gallon") == 1.0
|
|
|
|
def test_completely_different(self):
|
|
assert jaccard_similarity("apple juice", "ground beef") == 0.0
|
|
|
|
def test_partial_overlap(self):
|
|
score = jaccard_similarity("kroger whole milk", "meijer whole milk")
|
|
assert 0.4 < score < 0.8 # "whole" and "milk" overlap
|
|
|
|
def test_empty_strings(self):
|
|
assert jaccard_similarity("", "") == 0.0
|
|
assert jaccard_similarity("milk", "") == 0.0
|
|
|
|
|
|
class TestMatchByUPC:
|
|
def test_match_found(self, session):
|
|
product = NormalizedProduct(
|
|
id=uuid.uuid4(),
|
|
canonical_name="Whole Milk, Gallon",
|
|
upc_variants=["0041250000001", "0041250000002"],
|
|
created_at=datetime.now(UTC),
|
|
updated_at=datetime.now(UTC),
|
|
)
|
|
session.add(product)
|
|
session.commit()
|
|
# SQLite doesn't support JSONB containment — this will raise
|
|
# In production (PostgreSQL), this would work
|
|
result = match_by_upc(session, "0041250000001")
|
|
assert result is not None
|
|
assert result.method == MatchMethod.UPC
|
|
assert result.confidence == 1.0
|
|
|
|
def test_no_match(self, session):
|
|
result = match_by_upc(session, "9999999999999")
|
|
assert result is None
|
|
|
|
|
|
class TestMatchByName:
|
|
def test_exact_name_match(self, session):
|
|
product = NormalizedProduct(
|
|
id=uuid.uuid4(),
|
|
canonical_name="Whole Milk, Gallon",
|
|
created_at=datetime.now(UTC),
|
|
updated_at=datetime.now(UTC),
|
|
)
|
|
session.add(product)
|
|
session.commit()
|
|
result = match_by_name(session, "Whole Milk Gallon")
|
|
assert result is not None
|
|
assert result.method == MatchMethod.NAME
|
|
assert result.confidence > 0.5
|
|
|
|
def test_fuzzy_match(self, session):
|
|
product = NormalizedProduct(
|
|
id=uuid.uuid4(),
|
|
canonical_name="Kroger Whole Milk, 1 Gallon",
|
|
created_at=datetime.now(UTC),
|
|
updated_at=datetime.now(UTC),
|
|
)
|
|
session.add(product)
|
|
session.commit()
|
|
result = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3)
|
|
assert result is not None
|
|
assert result.confidence > 0.3
|
|
|
|
def test_no_match_below_threshold(self, session):
|
|
product = NormalizedProduct(
|
|
id=uuid.uuid4(),
|
|
canonical_name="Ground Beef 80/20",
|
|
created_at=datetime.now(UTC),
|
|
updated_at=datetime.now(UTC),
|
|
)
|
|
session.add(product)
|
|
session.commit()
|
|
result = match_by_name(session, "Apple Juice 64 oz", threshold=0.5)
|
|
assert result is None
|
|
|
|
|
|
class TestNormalizeProduct:
|
|
def test_name_fallback(self, session):
|
|
product = NormalizedProduct(
|
|
id=uuid.uuid4(),
|
|
canonical_name="Large Eggs, 12 count",
|
|
created_at=datetime.now(UTC),
|
|
updated_at=datetime.now(UTC),
|
|
)
|
|
session.add(product)
|
|
session.commit()
|
|
result = normalize_product(session, "Large Eggs 12 ct", upc=None)
|
|
assert result is not None
|
|
assert result.method == MatchMethod.NAME
|
|
|
|
def test_no_match(self, session):
|
|
result = normalize_product(session, "Nonexistent Product XYZ", upc=None)
|
|
assert result is None
|