# Source: cartsnitch-fork-test/receiptwitness/tests/test_regression/test_rate_limiting.py
# (366 lines, 14 KiB, Python)

"""Regression tests: rate limiting and retry behavior.
Validates that scrapers enforce human-like delays between requests
and handle rate-limit/error responses gracefully without infinite retries.
"""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, patch
import pytest
from receiptwitness.scrapers.base import SessionData
from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper
from receiptwitness.scrapers.meijer import MeijerScraper
class TestHumanDelayBehavior:
    """Verify that human_delay respects configured bounds."""

    # Patch target for the sleep call; shared so every test mocks the same name.
    _SLEEP_PATH = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self) -> None:
        """human_delay should sleep between min_ms/1000 and max_ms/1000 seconds."""
        scraper = KrogerScraper()
        with patch(self._SLEEP_PATH, new_callable=AsyncMock) as mock_sleep:
            await scraper.human_delay(100, 200)
            # assert_awaited_once (rather than assert_called_once) also fails
            # when the sleep coroutine is created but never awaited, i.e. when
            # no delay actually took place.
            mock_sleep.assert_awaited_once()
            delay = mock_sleep.await_args[0][0]
            assert 0.1 <= delay <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self) -> None:
        """Without explicit args, should use settings.min/max_request_delay_ms."""
        scraper = MeijerScraper()
        with (
            patch("receiptwitness.scrapers.base.settings") as mock_settings,
            patch(self._SLEEP_PATH, new_callable=AsyncMock) as mock_sleep,
        ):
            # Configure the patched settings before the call under test.
            mock_settings.min_request_delay_ms = 1000
            mock_settings.max_request_delay_ms = 5000
            await scraper.human_delay()
            mock_sleep.assert_awaited_once()
            delay = mock_sleep.await_args[0][0]
            assert 1.0 <= delay <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self) -> None:
        """Multiple calls should produce different delays (probabilistic)."""
        scraper = KrogerScraper()
        delays = []
        with patch(self._SLEEP_PATH, new_callable=AsyncMock) as mock_sleep:
            for _ in range(20):
                await scraper.human_delay(100, 5000)
                # await_args always reflects the most recent awaited call.
                delays.append(mock_sleep.await_args[0][0])
        # With range 100-5000ms, 20 calls should have at least 2 distinct values;
        # the test only fails if all 20 recorded delays are identical.
        assert len(set(delays)) >= 2
class TestKrogerRateLimiting:
    """Verify Kroger scraper calls human_delay between receipt fetches."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self) -> None:
        """Scraper must call human_delay for each receipt detail fetch.

        Playwright is fully mocked. The mocked request's ``get`` yields, in
        order, one listing response containing three orders and then three
        empty detail responses.
        """
        receipt_count = 3
        scraper = KrogerScraper()
        now = datetime.now(UTC)
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=2),
        )

        # Listing response: one order entry per expected receipt.
        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "orders": [
                    {
                        "orderId": f"KR-{i}",
                        "purchaseDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "357",
                    }
                    for i in range(receipt_count)
                ]
            }
        )

        # Detail response reused for every receipt; its payload is empty.
        mock_detail_response = AsyncMock()
        mock_detail_response.ok = True
        mock_detail_response.json = AsyncMock(return_value={})

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response] + [mock_detail_response] * receipt_count
        )

        # Mocked Playwright chain: playwright -> browser -> context -> page.
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request
        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser
        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        # Stand-in for the object returned by async_playwright().
        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay,
        ):
            mock_apw.return_value = mock_cm
            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == receipt_count
        # Lower bound: at least one delay per receipt detail fetch. await_count
        # is used instead of call_count so a human_delay() call that is never
        # awaited (and therefore never delays) does not satisfy the test.
        # NOTE(review): an extra delay around the initial page navigation would
        # make the real count 4 - confirm against KrogerScraper before
        # tightening this threshold.
        assert mock_delay.await_count >= receipt_count
class TestMeijerRateLimiting:
    """Verify Meijer scraper calls human_delay between receipt fetches."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self) -> None:
        """Scraper must call human_delay for each receipt detail fetch.

        Playwright is fully mocked. The mocked request's ``get`` yields, in
        order, one listing response containing three transactions and then
        three empty detail responses.
        """
        receipt_count = 3
        scraper = MeijerScraper()
        now = datetime.now(UTC)
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=now,
            expires_at=now + timedelta(hours=4),
        )

        # Listing response: one transaction entry per expected receipt.
        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": f"TXN-{i}",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    }
                    for i in range(receipt_count)
                ]
            }
        )

        # Detail response reused for every receipt; its payload is empty.
        mock_detail_response = AsyncMock()
        mock_detail_response.ok = True
        mock_detail_response.json = AsyncMock(return_value={})

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response] + [mock_detail_response] * receipt_count
        )

        # Mocked Playwright chain: playwright -> browser -> context -> page.
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request
        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser
        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        # Stand-in for the object returned by async_playwright().
        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay,
        ):
            mock_apw.return_value = mock_cm
            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == receipt_count
        # Lower bound: at least one delay per receipt detail fetch. await_count
        # is used instead of call_count so a human_delay() call that is never
        # awaited (and therefore never delays) does not satisfy the test.
        assert mock_delay.await_count >= receipt_count
class TestGracefulErrorRecovery:
    """Scrapers should not retry endlessly on errors."""

    @staticmethod
    def _mock_playwright_cm(mock_request: AsyncMock) -> AsyncMock:
        """Return a mocked ``async_playwright()`` context manager.

        Entering the returned mock yields a Playwright mock whose
        ``chromium.launch`` resolves to a browser -> context -> page chain,
        with ``page.request`` set to *mock_request*.
        """
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request

        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()

        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser

        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)
        return mock_cm

    async def _assert_kroger_error_not_retried(self, status: int, status_text: str) -> None:
        """Scrape Kroger against a non-OK listing response and check for no retry.

        The mocked ``get`` always returns the same failing response, so any
        retry would show up as a second call.
        """
        scraper = KrogerScraper()
        now = datetime.now(UTC)
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=now,
            expires_at=now + timedelta(hours=2),
        )

        mock_api_response = AsyncMock()
        mock_api_response.ok = False
        mock_api_response.status = status
        mock_api_response.status_text = status_text

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(return_value=mock_api_response)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._mock_playwright_cm(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)

        assert receipts == []
        # Should only call the API once — no retries
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_kroger_api_500_returns_empty_not_retry(self) -> None:
        """500 error should return empty list, not retry."""
        await self._assert_kroger_error_not_retried(500, "Internal Server Error")

    @pytest.mark.asyncio
    async def test_kroger_429_returns_empty_not_retry(self) -> None:
        """Rate limit (429) should return empty, not retry."""
        await self._assert_kroger_error_not_retried(429, "Too Many Requests")

    @pytest.mark.asyncio
    async def test_meijer_detail_exception_continues(self) -> None:
        """A failed detail fetch for one receipt should not abort remaining receipts.

        NOTE(review): despite the test name, the failure simulated here is a
        non-OK (500) detail response, not a raised exception - confirm whether
        a raising ``get`` should be covered as well.
        """
        scraper = MeijerScraper()
        now = datetime.now(UTC)
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=now,
            expires_at=now + timedelta(hours=4),
        )

        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": "TXN-1",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-2",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )

        # First detail call returns a non-OK (500) response, second succeeds
        mock_detail_fail = AsyncMock()
        mock_detail_fail.ok = False
        mock_detail_fail.status = 500
        mock_detail_ok = AsyncMock()
        mock_detail_ok.ok = True
        mock_detail_ok.json = AsyncMock(return_value={"items": []})

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response, mock_detail_fail, mock_detail_ok]
        )

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._mock_playwright_cm(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)

        # Both receipts should be returned — the first with empty detail
        assert len(receipts) == 2
        assert receipts[0].raw_data.get("detail") == {}
        assert receipts[1].receipt_id == "TXN-2"