Squashed 'receiptwitness/' content from commit e8d374a

git-subtree-dir: receiptwitness
git-subtree-split: e8d374a89ed8978f429598e02d31b1c5963efe22
This commit is contained in:
Coupon Carl
2026-03-28 02:24:22 +00:00
commit 342906c9d1
53 changed files with 7300 additions and 0 deletions
View File
@@ -0,0 +1,435 @@
"""Regression tests: graceful handling of page layout changes.
Retailers frequently change their API response structures, field names,
and nesting. These tests verify that both parsers degrade gracefully when
encountering alternative or missing fields — producing valid output
instead of crashing.
"""
from decimal import Decimal
from receiptwitness.parsers.kroger import parse_kroger_receipt
from receiptwitness.parsers.meijer import parse_meijer_receipt
from receiptwitness.scrapers.base import RawReceipt
class TestKrogerFieldNameVariations:
    """Kroger changes field names between app versions and API revisions.

    Each test feeds ``parse_kroger_receipt`` a payload that uses one
    alternative field spelling and checks that the value still lands in the
    normalized output.
    """

    @staticmethod
    def _parse(receipt_id, raw_data):
        """Wrap *raw_data* in a ``RawReceipt`` dated 2026-03-12 and parse it."""
        raw = RawReceipt(
            receipt_id=receipt_id,
            purchase_date="2026-03-12",
            raw_data=raw_data,
        )
        return parse_kroger_receipt(raw)

    def test_alternative_item_key_line_items(self):
        """Items listed under ``lineItems`` are parsed."""
        result = self._parse(
            "KR-ALT-1",
            {
                "detail": {
                    "lineItems": [{"description": "MILK", "basePrice": 3.99, "totalPrice": 3.99}],
                    "total": 3.99,
                }
            },
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "MILK"

    def test_alternative_item_key_receipt_items(self):
        """Items listed under ``receiptItems`` are parsed."""
        result = self._parse(
            "KR-ALT-2",
            {
                "detail": {
                    "receiptItems": [
                        {"description": "EGGS", "basePrice": 5.49, "totalPrice": 5.49}
                    ],
                    "total": 5.49,
                }
            },
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "EGGS"

    def test_alternative_description_fields(self):
        """Test productName, itemDescription and name fallbacks."""
        for field in ("productName", "itemDescription", "name"):
            result = self._parse(
                "KR-DESC",
                {
                    "detail": {
                        "items": [{field: "TEST PRODUCT", "basePrice": 1.00, "totalPrice": 1.00}],
                        "total": 1.00,
                    }
                },
            )
            # Name the variant so a failure points at the exact field spelling.
            assert (
                result["items"][0]["product_name_raw"] == "TEST PRODUCT"
            ), f"description fallback failed for field {field!r}"

    def test_alternative_price_fields(self):
        """Test unitPrice and price fallbacks for basePrice."""
        variants = (
            ("KR-PRICE-1", "ITEM A", "unitPrice", 2.50, Decimal("2.50")),
            ("KR-PRICE-2", "ITEM B", "price", 4.00, Decimal("4.00")),
        )
        for receipt_id, description, field, amount, expected in variants:
            result = self._parse(
                receipt_id,
                {
                    "detail": {
                        "items": [
                            {"description": description, field: amount, "totalPrice": amount}
                        ],
                        "total": amount,
                    }
                },
            )
            assert (
                result["items"][0]["unit_price"] == expected
            ), f"unit price fallback failed for field {field!r}"

    def test_alternative_total_fields(self):
        """Test top-level orderTotal and grandTotal fallbacks."""
        for field in ("orderTotal", "grandTotal"):
            result = self._parse("KR-TOT", {field: 42.50, "detail": {}})
            assert result["total"] == Decimal(
                "42.50"
            ), f"total fallback failed for field {field!r}"

    def test_alternative_savings_fields(self):
        """Test the top-level youSaved fallback for the savings total.

        Only ``youSaved`` is exercised here; no other savings spelling is
        covered by this test.
        """
        result = self._parse("KR-SAV-1", {"youSaved": 5.00, "detail": {}})
        assert result["savings_total"] == Decimal("5.00")

    def test_alternative_tax_field(self):
        """A top-level ``salesTax`` value is reported as the tax."""
        result = self._parse("KR-TAX", {"salesTax": 3.25, "detail": {}})
        assert result["tax"] == Decimal("3.25")

    def test_alternative_quantity_field_qty(self):
        """An item's ``qty`` field is reported as its quantity."""
        result = self._parse(
            "KR-QTY",
            {
                "detail": {
                    "items": [
                        {"description": "APPLES", "qty": 5, "basePrice": 1.00, "totalPrice": 5.00}
                    ],
                    "total": 5.00,
                }
            },
        )
        assert result["items"][0]["quantity"] == Decimal("5")

    def test_alternative_upc_field_kroger_product_id(self):
        """An item's ``krogerProductId`` is reported as its UPC."""
        result = self._parse(
            "KR-UPC",
            {
                "detail": {
                    "items": [
                        {
                            "description": "ITEM",
                            "krogerProductId": "12345678",
                            "basePrice": 1.00,
                            "totalPrice": 1.00,
                        }
                    ],
                    "total": 1.00,
                }
            },
        )
        assert result["items"][0]["upc"] == "12345678"

    def test_missing_extended_price_computed(self):
        """When totalPrice is missing, extended_price = unit_price * quantity."""
        result = self._parse(
            "KR-CALC",
            {
                "detail": {
                    "items": [{"description": "EGGS", "basePrice": 5.49, "quantity": 2}],
                    "total": 10.98,
                }
            },
        )
        assert result["items"][0]["extended_price"] == Decimal("5.49") * Decimal("2")
class TestMeijerFieldNameVariations:
    """Meijer XHR endpoints may change field names between SPA versions.

    Each test feeds ``parse_meijer_receipt`` a payload that uses one
    alternative field spelling and checks that the value still lands in the
    normalized output.
    """

    @staticmethod
    def _parse(receipt_id, raw_data):
        """Wrap *raw_data* in a ``RawReceipt`` dated 2026-03-10 and parse it."""
        raw = RawReceipt(
            receipt_id=receipt_id,
            purchase_date="2026-03-10",
            raw_data=raw_data,
        )
        return parse_meijer_receipt(raw)

    def test_alternative_item_key_line_items(self):
        """Items listed under ``lineItems`` are parsed."""
        result = self._parse(
            "MJ-ALT-1",
            {
                "detail": {
                    "lineItems": [
                        {"description": "BANANAS", "price": 0.69, "extendedPrice": 0.69}
                    ],
                    "total": 0.69,
                }
            },
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "BANANAS"

    def test_alternative_description_fields(self):
        """Test itemDescription and name fallbacks for the item description."""
        for field in ("itemDescription", "name"):
            result = self._parse(
                "MJ-DESC",
                {
                    "detail": {
                        "items": [{field: "TEST ITEM", "price": 1.00, "extendedPrice": 1.00}],
                        "total": 1.00,
                    }
                },
            )
            # Name the variant so a failure points at the exact field spelling.
            assert (
                result["items"][0]["product_name_raw"] == "TEST ITEM"
            ), f"description fallback failed for field {field!r}"

    def test_alternative_price_field_unit_price(self):
        """An item's ``unitPrice`` is reported as its unit price."""
        result = self._parse(
            "MJ-PRICE",
            {
                "detail": {
                    "items": [{"description": "MILK", "unitPrice": 3.49, "totalPrice": 3.49}],
                    "total": 3.49,
                }
            },
        )
        assert result["items"][0]["unit_price"] == Decimal("3.49")

    def test_alternative_extended_price_field_total_price(self):
        """An item's ``totalPrice`` is reported as its extended price."""
        result = self._parse(
            "MJ-EXT",
            {
                "detail": {
                    "items": [{"description": "CEREAL", "price": 4.99, "totalPrice": 4.99}],
                    "total": 4.99,
                }
            },
        )
        assert result["items"][0]["extended_price"] == Decimal("4.99")

    def test_alternative_total_field_transaction_total(self):
        """A top-level ``transactionTotal`` is reported as the receipt total."""
        result = self._parse("MJ-TOT", {"transactionTotal": 55.00, "detail": {}})
        assert result["total"] == Decimal("55.00")

    def test_alternative_loyalty_field(self):
        """An item's ``loyaltyDiscount`` is reported as its loyalty discount."""
        result = self._parse(
            "MJ-LOY",
            {
                "detail": {
                    "items": [
                        {
                            "description": "ITEM",
                            "price": 5.00,
                            "extendedPrice": 5.00,
                            "loyaltyDiscount": 0.50,
                        }
                    ],
                    "total": 5.00,
                }
            },
        )
        assert result["items"][0]["loyalty_discount"] == Decimal("0.50")

    def test_alternative_upc_field_uppercase(self):
        """An upper-case ``UPC`` key is read and its leading zeros are dropped."""
        result = self._parse(
            "MJ-UPC",
            {
                "detail": {
                    "items": [
                        {
                            "description": "ITEM",
                            "UPC": "0012345678",
                            "price": 1.00,
                            "extendedPrice": 1.00,
                        }
                    ],
                    "total": 1.00,
                }
            },
        )
        assert result["items"][0]["upc"] == "12345678"

    def test_alternative_category_field(self):
        """An item's ``departmentDescription`` is reported as its raw category."""
        result = self._parse(
            "MJ-CAT",
            {
                "detail": {
                    "items": [
                        {
                            "description": "ITEM",
                            "price": 1.00,
                            "extendedPrice": 1.00,
                            "departmentDescription": "FROZEN",
                        }
                    ],
                    "total": 1.00,
                }
            },
        )
        assert result["items"][0]["category_raw"] == "FROZEN"

    def test_missing_extended_price_computed(self):
        """Without an extended price field, extended_price = price * quantity."""
        result = self._parse(
            "MJ-CALC",
            {
                "detail": {
                    "items": [{"description": "MILK", "price": 3.49, "quantity": 2}],
                    "total": 6.98,
                }
            },
        )
        assert result["items"][0]["extended_price"] == Decimal("3.49") * Decimal("2")

    def test_missing_description_fallback(self):
        """An item with no description field is named ``UNKNOWN ITEM``."""
        result = self._parse(
            "MJ-NODESC",
            {
                "detail": {
                    "items": [{"price": 1.00, "extendedPrice": 1.00}],
                    "total": 1.00,
                }
            },
        )
        assert result["items"][0]["product_name_raw"] == "UNKNOWN ITEM"
class TestMixedFieldVersions:
    """Test receipts that mix field naming conventions (happens during rollouts)."""

    def test_kroger_mixed_item_fields(self):
        """Old-style and new-style item field names may coexist in one receipt."""
        old_style_item = {"description": "OLD STYLE", "basePrice": 2.00, "totalPrice": 2.00}
        new_style_item = {"productName": "NEW STYLE", "unitPrice": 3.00, "extendedAmount": 3.00}
        receipt = RawReceipt(
            receipt_id="KR-MIX",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [old_style_item, new_style_item], "total": 5.00}},
        )

        parsed_items = parse_kroger_receipt(receipt)["items"]

        assert len(parsed_items) == 2
        first, second = parsed_items[0], parsed_items[1]
        assert first["product_name_raw"] == "OLD STYLE"
        assert first["unit_price"] == Decimal("2.00")
        assert second["product_name_raw"] == "NEW STYLE"
        assert second["unit_price"] == Decimal("3.00")

    def test_kroger_completely_unknown_structure_no_crash(self):
        """An unrecognized Kroger payload parses to a receipt with no items."""
        payload = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        receipt = RawReceipt(
            receipt_id="KR-UNKNOWN",
            purchase_date="2026-03-12",
            raw_data=payload,
        )

        parsed = parse_kroger_receipt(receipt)

        assert parsed["receipt_id"] == "KR-UNKNOWN"
        assert parsed["items"] == []

    def test_meijer_completely_unknown_structure_no_crash(self):
        """An unrecognized Meijer payload parses to a receipt with no items."""
        payload = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        receipt = RawReceipt(
            receipt_id="MJ-UNKNOWN",
            purchase_date="2026-03-10",
            raw_data=payload,
        )

        parsed = parse_meijer_receipt(receipt)

        assert parsed["receipt_id"] == "MJ-UNKNOWN"
        assert parsed["items"] == []

    def test_kroger_null_fields_no_crash(self):
        """Kroger fields set to None are tolerated; the unit price becomes 0."""
        null_item = {
            "description": "ITEM",
            "basePrice": None,
            "totalPrice": None,
            "quantity": None,
            "upc": None,
            "department": None,
        }
        receipt = RawReceipt(
            receipt_id="KR-NULL",
            purchase_date="2026-03-12",
            raw_data={
                "detail": {
                    "items": [null_item],
                    "total": None,
                    "subtotal": None,
                    "tax": None,
                }
            },
        )

        only_item = parse_kroger_receipt(receipt)["items"][0]

        assert only_item["product_name_raw"] == "ITEM"
        assert only_item["unit_price"] == Decimal("0")

    def test_meijer_null_fields_no_crash(self):
        """Meijer fields set to None are tolerated; the unit price becomes 0."""
        null_item = {
            "description": "ITEM",
            "price": None,
            "extendedPrice": None,
            "quantity": None,
            "upc": None,
            "category": None,
        }
        receipt = RawReceipt(
            receipt_id="MJ-NULL",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": [null_item], "total": None}},
        )

        only_item = parse_meijer_receipt(receipt)["items"][0]

        assert only_item["product_name_raw"] == "ITEM"
        assert only_item["unit_price"] == Decimal("0")
+365
View File
@@ -0,0 +1,365 @@
"""Regression tests: rate limiting and retry behavior.
Validates that scrapers enforce human-like delays between requests
and handle rate-limit/error responses gracefully without infinite retries.
"""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, patch
import pytest
from receiptwitness.scrapers.base import SessionData
from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper
from receiptwitness.scrapers.meijer import MeijerScraper
class TestHumanDelayBehavior:
    """Verify that human_delay respects configured bounds."""

    # Patch target for asyncio.sleep as looked up through the scraper base module.
    SLEEP_TARGET = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self):
        """human_delay should sleep between min_ms/1000 and max_ms/1000 seconds."""
        kroger = KrogerScraper()
        with patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            await kroger.human_delay(100, 200)

        fake_sleep.assert_called_once()
        slept_seconds = fake_sleep.call_args.args[0]
        assert 0.1 <= slept_seconds <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self):
        """Without explicit args, should use settings.min/max_request_delay_ms."""
        meijer = MeijerScraper()
        with (
            patch("receiptwitness.scrapers.base.settings") as fake_settings,
            patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep,
        ):
            fake_settings.min_request_delay_ms = 1000
            fake_settings.max_request_delay_ms = 5000
            await meijer.human_delay()

        fake_sleep.assert_called_once()
        slept_seconds = fake_sleep.call_args.args[0]
        assert 1.0 <= slept_seconds <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self):
        """Multiple calls should produce different delays (probabilistic)."""
        kroger = KrogerScraper()
        distinct_delays = set()
        with patch(self.SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            for _ in range(20):
                await kroger.human_delay(100, 5000)
                # Record the argument of the most recent sleep call.
                distinct_delays.add(fake_sleep.call_args.args[0])

        # With range 100-5000ms, 20 calls should have at least 2 distinct values
        assert len(distinct_delays) >= 2
class TestKrogerRateLimiting:
    """Verify Kroger scraper calls human_delay between receipt fetches."""

    @staticmethod
    def _fake_playwright(request_stub):
        """Return an async context manager mock standing in for async_playwright().

        Entering it yields a mock whose ``chromium.launch`` gives a browser,
        whose ``new_context`` gives a context, whose ``new_page`` gives a page
        with ``page.request`` set to *request_stub*.
        """
        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request_stub

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Scraper must call human_delay for each receipt detail fetch."""
        kroger = KrogerScraper()
        session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=2),
        )

        # First GET answers with a listing of three orders.
        orders = [
            {
                "orderId": f"KR-{i}",
                "purchaseDate": "2026-03-10T14:00:00Z",
                "storeNumber": "357",
            }
            for i in range(3)
        ]
        listing = AsyncMock()
        listing.ok = True
        listing.json = AsyncMock(return_value={"orders": orders})

        # The next three GETs answer with an empty detail document each.
        detail = AsyncMock()
        detail.ok = True
        detail.json = AsyncMock(return_value={})

        request_stub = AsyncMock()
        request_stub.get = AsyncMock(side_effect=[listing, detail, detail, detail])

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as playwright_factory,
            patch.object(kroger, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            playwright_factory.return_value = self._fake_playwright(request_stub)
            receipts = await kroger.scrape_receipts(session)

        assert len(receipts) == 3
        # One delay per receipt is the enforced lower bound; any additional
        # delay (e.g. around the initial navigation) is allowed, not required.
        assert delay_spy.call_count >= 3
class TestMeijerRateLimiting:
    """Verify Meijer scraper calls human_delay between receipt fetches."""

    @staticmethod
    def _fake_playwright(request_stub):
        """Return an async context manager mock standing in for async_playwright().

        Entering it yields a mock whose ``chromium.launch`` gives a browser,
        whose ``new_context`` gives a context, whose ``new_page`` gives a page
        with ``page.request`` set to *request_stub*.
        """
        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request_stub

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Three listed transactions yield three receipts and >= 3 human_delay calls."""
        meijer = MeijerScraper()
        session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=4),
        )

        # First GET answers with a listing of three transactions.
        transactions = [
            {
                "transactionId": f"TXN-{i}",
                "transactionDate": "2026-03-10T14:00:00Z",
                "storeNumber": "42",
            }
            for i in range(3)
        ]
        listing = AsyncMock()
        listing.ok = True
        listing.json = AsyncMock(return_value={"transactions": transactions})

        # The next three GETs answer with an empty detail document each.
        detail = AsyncMock()
        detail.ok = True
        detail.json = AsyncMock(return_value={})

        request_stub = AsyncMock()
        request_stub.get = AsyncMock(side_effect=[listing, detail, detail, detail])

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as playwright_factory,
            patch.object(meijer, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            playwright_factory.return_value = self._fake_playwright(request_stub)
            receipts = await meijer.scrape_receipts(session)

        assert len(receipts) == 3
        assert delay_spy.call_count >= 3
class TestGracefulErrorRecovery:
    """Scrapers should not retry endlessly on errors."""

    @staticmethod
    def _session(domain, user_agent, hours):
        """Build a currently valid ``SessionData`` expiring *hours* from now."""
        return SessionData(
            cookies=[{"name": "s", "value": "v", "domain": domain, "path": "/"}],
            user_agent=user_agent,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=hours),
        )

    @staticmethod
    def _fake_playwright(request_stub):
        """Return an async context manager mock standing in for async_playwright().

        Entering it yields a mock whose ``chromium.launch`` gives a browser,
        whose ``new_context`` gives a context, whose ``new_page`` gives a page
        with ``page.request`` set to *request_stub*.
        """
        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request_stub

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)
        return manager

    async def _assert_kroger_listing_failure_not_retried(self, status, status_text):
        """Check a non-OK Kroger listing response yields [] after exactly one GET."""
        scraper = KrogerScraper()
        session = self._session(".kroger.com", DEFAULT_USER_AGENT, hours=2)

        failed_response = AsyncMock()
        failed_response.ok = False
        failed_response.status = status
        failed_response.status_text = status_text

        # Every GET would return the same failure, so a retry would show up
        # as a call count above one.
        request_stub = AsyncMock()
        request_stub.get = AsyncMock(return_value=failed_response)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._fake_playwright(request_stub)
            receipts = await scraper.scrape_receipts(session)

        assert receipts == []
        # Should only call the API once — no retries
        assert request_stub.get.call_count == 1

    @pytest.mark.asyncio
    async def test_kroger_api_500_returns_empty_not_retry(self):
        """500 error should return empty list, not retry."""
        await self._assert_kroger_listing_failure_not_retried(500, "Internal Server Error")

    @pytest.mark.asyncio
    async def test_kroger_429_returns_empty_not_retry(self):
        """Rate limit (429) should return empty, not retry."""
        await self._assert_kroger_listing_failure_not_retried(429, "Too Many Requests")

    @pytest.mark.asyncio
    async def test_meijer_detail_exception_continues(self):
        """A failed detail fetch for one receipt should not abort remaining receipts.

        The failure is simulated with a non-OK (HTTP 500) detail response
        rather than a raised exception.
        """
        scraper = MeijerScraper()
        session = self._session(".meijer.com", "test", hours=4)

        listing = AsyncMock()
        listing.ok = True
        listing.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": "TXN-1",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-2",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )

        # First detail call returns a non-OK 500 response, second succeeds
        detail_fail = AsyncMock()
        detail_fail.ok = False
        detail_fail.status = 500
        detail_ok = AsyncMock()
        detail_ok.ok = True
        detail_ok.json = AsyncMock(return_value={"items": []})

        request_stub = AsyncMock()
        request_stub.get = AsyncMock(side_effect=[listing, detail_fail, detail_ok])

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._fake_playwright(request_stub)
            receipts = await scraper.scrape_receipts(session)

        # Both receipts should be returned — the first with empty detail
        assert len(receipts) == 2
        assert receipts[0].receipt_id == "TXN-1"
        assert receipts[0].raw_data.get("detail") == {}
        assert receipts[1].receipt_id == "TXN-2"
@@ -0,0 +1,364 @@
"""Regression tests: scraper output matches expected schema.
Validates that parsed receipts from both Kroger and Meijer conform to the
PurchaseCreate schema contract. Uses recorded fixtures to ensure outputs
remain stable across code changes.
"""
from decimal import Decimal
from receiptwitness.parsers.kroger import parse_kroger_receipt
from receiptwitness.parsers.meijer import parse_meijer_receipt
from receiptwitness.scrapers.base import RawReceipt
# Top-level keys of a parsed receipt: those that must always be present,
# and those that may additionally appear.
RECEIPT_REQUIRED_KEYS = {
    "receipt_id",
    "purchase_date",
    "total",
    "items",
    "raw_data",
}
RECEIPT_OPTIONAL_KEYS = {
    "subtotal",
    "tax",
    "savings_total",
    "source_url",
}

# Keys of each parsed line item: mandatory ones first, then the extras
# that may additionally appear.
ITEM_REQUIRED_KEYS = {"product_name_raw", "upc", "quantity", "unit_price", "extended_price"}
ITEM_OPTIONAL_KEYS = {
    "regular_price",
    "sale_price",
    "coupon_discount",
    "loyalty_discount",
    "category_raw",
}
def _validate_receipt_schema(result: dict) -> None:
    """Assert that a parsed receipt dict has the expected keys and value types.

    Checks, in order: every required key is present; required values have
    their expected types; optional values, when present and not None, have
    their expected types; and no key outside the required/optional sets
    appears. Raises ``AssertionError`` on the first violation.
    """
    # Presence of required keys.
    for key in RECEIPT_REQUIRED_KEYS:
        assert key in result, f"Missing required key: {key}"

    # Types of required values.
    assert isinstance(result["receipt_id"], str)
    assert isinstance(result["purchase_date"], str)
    assert isinstance(result["total"], Decimal)
    assert isinstance(result["items"], list)
    assert isinstance(result["raw_data"], dict)

    # Optional monetary values must be Decimal when set.
    for money_field in ("subtotal", "tax", "savings_total"):
        if result.get(money_field) is not None:
            assert isinstance(result[money_field], Decimal)
    if result.get("source_url") is not None:
        assert isinstance(result["source_url"], str)

    # Anything outside the known key sets is a schema violation.
    all_keys = RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS
    for key in result:
        assert key in all_keys, f"Unexpected key in receipt: {key}"
def _validate_item_schema(item: dict) -> None:
    """Assert that a parsed item dict has the expected keys and value types.

    Checks, in order: every required key is present; the product name is a
    non-empty string; quantity and prices are ``Decimal``; a non-None UPC is
    a string that does not start with "0"; optional values, when present and
    not None, have their expected types; and no key outside the
    required/optional sets appears. Raises ``AssertionError`` on the first
    violation.
    """
    for key in ITEM_REQUIRED_KEYS:
        assert key in item, f"Missing required item key: {key}"

    assert isinstance(item["product_name_raw"], str)
    assert len(item["product_name_raw"]) > 0
    for numeric_field in ("quantity", "unit_price", "extended_price"):
        assert isinstance(item[numeric_field], Decimal)

    # UPC may be None; a present UPC must be a string without leading zeros
    # (these are expected to have been stripped during parsing).
    if item["upc"] is not None:
        assert isinstance(item["upc"], str)
        assert not item["upc"].startswith("0"), f"UPC has leading zeros: {item['upc']}"

    # Optional price/discount values must be Decimal when set.
    for opt_key in ("regular_price", "sale_price", "coupon_discount", "loyalty_discount"):
        if item.get(opt_key) is not None:
            assert isinstance(item[opt_key], Decimal), f"{opt_key} should be Decimal"
    if item.get("category_raw") is not None:
        assert isinstance(item["category_raw"], str)

    # Anything outside the known key sets is a schema violation.
    all_keys = ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS
    for key in item:
        assert key in all_keys, f"Unexpected key in item: {key}"
class TestKrogerSchemaValidation:
    """Schema and value checks for the recorded Kroger receipt fixture."""

    RECEIPT_ID = "KR-2026-0312-4471"
    PURCHASED_AT = "2026-03-12T16:45:00Z"

    def _parse_fixture(self, fixture, **extra_fields):
        """Parse *fixture* as the recorded Kroger receipt.

        *extra_fields* are passed through to ``RawReceipt`` unchanged.
        """
        raw = RawReceipt(
            receipt_id=self.RECEIPT_ID,
            purchase_date=self.PURCHASED_AT,
            raw_data=fixture,
            **extra_fields,
        )
        return parse_kroger_receipt(raw)

    def test_full_receipt_schema(self, kroger_receipt_data):
        """The parsed receipt and each of its items satisfy the schema validators."""
        parsed = self._parse_fixture(
            kroger_receipt_data,
            store_number="00357",
            source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=KR-2026-0312-4471",
        )
        _validate_receipt_schema(parsed)
        for item in parsed["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided_and_returned(self, kroger_receipt_data):
        """Fixture has 10 items, 2 should be excluded (voided + returned)."""
        parsed = self._parse_fixture(kroger_receipt_data)
        assert len(parsed["items"]) == 8

    def test_totals_are_positive_decimals(self, kroger_receipt_data):
        """Total, subtotal, tax and savings total are all greater than zero."""
        parsed = self._parse_fixture(kroger_receipt_data)
        zero = Decimal("0")
        assert parsed["total"] > zero
        assert parsed["subtotal"] > zero
        assert parsed["tax"] > zero
        assert parsed["savings_total"] > zero

    def test_receipt_id_preserved(self, kroger_receipt_data):
        """The receipt id given to ``RawReceipt`` appears unchanged in the output."""
        parsed = self._parse_fixture(kroger_receipt_data)
        assert parsed["receipt_id"] == "KR-2026-0312-4471"

    def test_known_product_prices(self, kroger_receipt_data):
        """Verify specific products produce correct price extraction."""
        parsed = self._parse_fixture(kroger_receipt_data)
        by_name = {entry["product_name_raw"]: entry for entry in parsed["items"]}

        # Milk: $3.99, regular $4.29
        milk = by_name["KROGER WHOLE MILK GAL"]
        assert milk["unit_price"] == Decimal("3.99")
        assert milk["regular_price"] == Decimal("4.29")
        assert milk["sale_price"] == Decimal("3.99")

        # Eggs: qty 2, $5.49 each, total $10.98
        eggs = by_name["SIMPLE TRUTH ORG EGGS 12CT"]
        assert eggs["quantity"] == Decimal("2")
        assert eggs["unit_price"] == Decimal("5.49")
        assert eggs["extended_price"] == Decimal("10.98")

        # Deli turkey: weighted item, 0.68 lb
        turkey = by_name["KROGER DELI TURKEY BREAST"]
        assert turkey["quantity"] == Decimal("0.68")
        assert turkey["upc"] is None

    def test_multi_quantity_item_correct(self, kroger_receipt_data):
        """Pasta is qty=3, unit=$2.49, total=$7.47."""
        parsed = self._parse_fixture(kroger_receipt_data)
        pasta_matches = [
            entry for entry in parsed["items"] if "PASTA" in entry["product_name_raw"]
        ]
        pasta = pasta_matches[0]
        assert pasta["quantity"] == Decimal("3")
        assert pasta["unit_price"] == Decimal("2.49")
        assert pasta["extended_price"] == Decimal("7.47")

    def test_coupon_discount_captured(self, kroger_receipt_data):
        """Tide Pods has $2.00 coupon."""
        parsed = self._parse_fixture(kroger_receipt_data)
        tide_matches = [
            entry for entry in parsed["items"] if "TIDE" in entry["product_name_raw"]
        ]
        assert tide_matches[0]["coupon_discount"] == Decimal("2.00")
class TestMeijerSchemaValidation:
    """Schema and value checks for the recorded Meijer receipt fixture."""

    @staticmethod
    def _parse_fixture(fixture, **extra_fields):
        """Parse *fixture* as receipt TXN-2026-0310-001.

        *extra_fields* are passed through to ``RawReceipt`` unchanged.
        """
        raw = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            raw_data=fixture,
            **extra_fields,
        )
        return parse_meijer_receipt(raw)

    def test_full_receipt_schema(self, meijer_receipt_data):
        """The parsed receipt and each of its items satisfy the schema validators."""
        result = self._parse_fixture(
            meijer_receipt_data,
            store_number="42",
            source_url="https://www.meijer.com/bin/meijer/profile/receipt?receiptId=TXN-2026-0310-001",
        )
        _validate_receipt_schema(result)
        for item in result["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided(self, meijer_receipt_data):
        """Fixture has 6 items, 1 should be excluded (voided soda)."""
        result = self._parse_fixture(meijer_receipt_data)
        assert len(result["items"]) == 5

    def test_totals_are_positive_decimals(self, meijer_receipt_data):
        """Total, subtotal, tax and savings total are all greater than zero."""
        result = self._parse_fixture(meijer_receipt_data)
        assert result["total"] > Decimal("0")
        assert result["subtotal"] > Decimal("0")
        assert result["tax"] > Decimal("0")
        assert result["savings_total"] > Decimal("0")

    def test_receipt_id_preserved(self, meijer_receipt_data):
        """The receipt id given to ``RawReceipt`` appears unchanged in the output."""
        result = self._parse_fixture(meijer_receipt_data)
        assert result["receipt_id"] == "TXN-2026-0310-001"

    def test_known_product_prices(self, meijer_receipt_data):
        """Verify specific Meijer products produce correct price extraction."""
        result = self._parse_fixture(meijer_receipt_data)
        items_by_name = {i["product_name_raw"]: i for i in result["items"]}

        # Bananas: $0.69
        bananas = items_by_name["ORGANIC BANANAS"]
        assert bananas["unit_price"] == Decimal("0.69")
        # "mperks_discount" is not a permitted item key in this module's item
        # schema, so it must not appear in parsed output; the discount is
        # checked via loyalty_discount instead. (The previous conditional
        # assertion passed whenever the key was absent.)
        assert "mperks_discount" not in bananas
        assert bananas["loyalty_discount"] == Decimal("0.10")

        # Milk: qty 2, $3.49 each, total $6.98
        milk = items_by_name["MEIJER 2% MILK GAL"]
        assert milk["quantity"] == Decimal("2")
        assert milk["unit_price"] == Decimal("3.49")
        assert milk["extended_price"] == Decimal("6.98")

        # Weighted deli turkey: quantity 0.75 and no UPC
        turkey = items_by_name["WEIGHTED DELI TURKEY"]
        assert turkey["quantity"] == Decimal("0.75")
        assert turkey["upc"] is None

    def test_mperks_discount_captured(self, meijer_receipt_data):
        """Paper towels has a $1.00 loyalty (mPerks) discount and a $1.00 coupon discount."""
        result = self._parse_fixture(meijer_receipt_data)
        towels = [i for i in result["items"] if "PAPER TOWELS" in i["product_name_raw"]][0]
        assert towels["loyalty_discount"] == Decimal("1.00")
        assert towels["coupon_discount"] == Decimal("1.00")

    def test_cheerios_coupon_discount(self, meijer_receipt_data):
        """Cheerios has $0.50 coupon."""
        result = self._parse_fixture(meijer_receipt_data)
        cheerios = [i for i in result["items"] if "CHEERIOS" in i["product_name_raw"]][0]
        assert cheerios["coupon_discount"] == Decimal("0.50")
class TestEmptyAndEdgeCaseSchemas:
    """Regression tests for edge-case receipts that should not crash."""

    def test_kroger_empty_receipt(self):
        """An empty Kroger payload parses to a schema-valid receipt: no items, total 0."""
        receipt = RawReceipt(receipt_id="KR-EMPTY", purchase_date="2026-03-12", raw_data={})
        parsed = parse_kroger_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_meijer_empty_receipt(self):
        """An empty Meijer payload parses to a schema-valid receipt: no items, total 0."""
        receipt = RawReceipt(receipt_id="MJ-EMPTY", purchase_date="2026-03-10", raw_data={})
        parsed = parse_meijer_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_kroger_receipt_no_detail(self):
        """A Kroger payload with only a top-level total keeps that total and has no items."""
        summary_only = {"total": 50.00}
        receipt = RawReceipt(
            receipt_id="KR-NODET",
            purchase_date="2026-03-12",
            raw_data=summary_only,
        )
        parsed = parse_kroger_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_meijer_receipt_no_detail(self):
        """A Meijer payload with only a top-level total keeps that total and has no items."""
        summary_only = {"total": 30.00}
        receipt = RawReceipt(
            receipt_id="MJ-NODET",
            purchase_date="2026-03-10",
            raw_data=summary_only,
        )
        parsed = parse_meijer_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("30.00")

    def test_kroger_receipt_all_voided(self):
        """A receipt where every item is voided should have 0 items."""
        # Four ways an item is marked as voided or returned in this payload.
        excluded_items = [
            {"description": "VOIDED A", "basePrice": 5.0, "voided": True},
            {"description": "VOIDED B", "basePrice": 3.0, "status": "VOIDED"},
            {"description": "RETURNED C", "basePrice": 7.0, "status": "RETURNED"},
            {"description": "RETURNED D", "basePrice": 2.0, "returnFlag": True},
        ]
        receipt = RawReceipt(
            receipt_id="KR-ALLVOID",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": excluded_items, "total": 0}},
        )
        parsed = parse_kroger_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []

    def test_meijer_receipt_all_voided(self):
        """A Meijer receipt whose items are all voided should have 0 items."""
        excluded_items = [
            {"description": "VOIDED A", "price": 5.0, "voided": True},
            {"description": "VOIDED B", "price": 3.0, "status": "VOIDED"},
        ]
        receipt = RawReceipt(
            receipt_id="MJ-ALLVOID",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": excluded_items, "total": 0}},
        )
        parsed = parse_meijer_receipt(receipt)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []