Files
receiptwitness/tests/test_pipeline/test_normalization.py
T
Barcode Betty 873f53b9fc fix: resolve CI failures — SQLite incompatibility and ruff lint errors
- Remove PostgreSQL-specific server_default from User.email_inbound_token.
  The column has a Python-side default (secrets.token_urlsafe) that works
  for both SQLite and PostgreSQL. The gen_random_bytes() server_default
  caused sqlite table creation to fail.

- Add missing back_populates relationships to stub models so SQLAlchemy
  mapper configuration succeeds. Purchase.user and Store.user_accounts
  were missing, causing "has no property" errors during Base.metadata.create_all.

- Auto-fix ruff import sorting (I001) across all source and test files.

- Manually fix line-too-long (E501) in config.py.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-19 13:06:47 +00:00

158 lines
5.0 KiB
Python

"""Tests for product normalization module."""
import uuid
from datetime import UTC, datetime
from receiptwitness.pipeline.normalization import (
MatchMethod,
clean_name,
extract_size_info,
jaccard_similarity,
match_by_name,
match_by_upc,
normalize_product,
)
from receiptwitness.shared.models import NormalizedProduct
class TestCleanName:
def test_lowercase(self):
assert clean_name("Kroger WHOLE MILK") == "kroger whole milk"
def test_removes_size_info(self):
assert "oz" not in clean_name("Milk 16 oz Whole")
def test_removes_noise_words(self):
cleaned = clean_name("The Original Brand Milk")
assert "the" not in cleaned.split()
assert "original" not in cleaned.split()
assert "brand" not in cleaned.split()
def test_collapses_whitespace(self):
assert " " not in clean_name("Milk Whole Gallon")
def test_removes_punctuation(self):
cleaned = clean_name("Meijer's Best (Organic) Milk!")
assert "'" not in cleaned
assert "(" not in cleaned
class TestExtractSizeInfo:
def test_extracts_oz(self):
result = extract_size_info("Cereal 18 oz box")
assert result == ("18", "oz")
def test_extracts_fl_oz(self):
result = extract_size_info("Juice 64 fl oz")
assert result == ("64", "fl_oz")
def test_extracts_lb(self):
result = extract_size_info("Ground Beef 1.5 lb")
assert result == ("1.5", "lb")
def test_extracts_ct(self):
result = extract_size_info("Eggs Large 12 ct")
assert result == ("12", "ct")
def test_no_size_returns_none(self):
assert extract_size_info("Bananas") is None
class TestJaccardSimilarity:
def test_identical_strings(self):
assert jaccard_similarity("whole milk gallon", "whole milk gallon") == 1.0
def test_completely_different(self):
assert jaccard_similarity("apple juice", "ground beef") == 0.0
def test_partial_overlap(self):
score = jaccard_similarity("kroger whole milk", "meijer whole milk")
assert 0.4 < score < 0.8 # "whole" and "milk" overlap
def test_empty_strings(self):
assert jaccard_similarity("", "") == 0.0
assert jaccard_similarity("milk", "") == 0.0
class TestMatchByUPC:
def test_match_found(self, session):
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name="Whole Milk, Gallon",
upc_variants=["0041250000001", "0041250000002"],
created_at=datetime.now(UTC),
updated_at=datetime.now(UTC),
)
session.add(product)
session.commit()
# SQLite doesn't support JSONB containment — this will raise
# In production (PostgreSQL), this would work
result = match_by_upc(session, "0041250000001")
assert result is not None
assert result.method == MatchMethod.UPC
assert result.confidence == 1.0
def test_no_match(self, session):
result = match_by_upc(session, "9999999999999")
assert result is None
class TestMatchByName:
def test_exact_name_match(self, session):
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name="Whole Milk, Gallon",
created_at=datetime.now(UTC),
updated_at=datetime.now(UTC),
)
session.add(product)
session.commit()
result = match_by_name(session, "Whole Milk Gallon")
assert result is not None
assert result.method == MatchMethod.NAME
assert result.confidence > 0.5
def test_fuzzy_match(self, session):
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name="Kroger Whole Milk, 1 Gallon",
created_at=datetime.now(UTC),
updated_at=datetime.now(UTC),
)
session.add(product)
session.commit()
result = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3)
assert result is not None
assert result.confidence > 0.3
def test_no_match_below_threshold(self, session):
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name="Ground Beef 80/20",
created_at=datetime.now(UTC),
updated_at=datetime.now(UTC),
)
session.add(product)
session.commit()
result = match_by_name(session, "Apple Juice 64 oz", threshold=0.5)
assert result is None
class TestNormalizeProduct:
def test_name_fallback(self, session):
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name="Large Eggs, 12 count",
created_at=datetime.now(UTC),
updated_at=datetime.now(UTC),
)
session.add(product)
session.commit()
result = normalize_product(session, "Large Eggs 12 ct", upc=None)
assert result is not None
assert result.method == MatchMethod.NAME
def test_no_match(self, session):
result = normalize_product(session, "Nonexistent Product XYZ", upc=None)
assert result is None