forked from cartsnitch/cartsnitch
366 lines
14 KiB
Python
366 lines
14 KiB
Python
"""Regression tests: rate limiting and retry behavior.
|
|
|
|
Validates that scrapers enforce human-like delays between requests
|
|
and handle rate-limit/error responses gracefully without infinite retries.
|
|
"""
|
|
|
|
from datetime import UTC, datetime, timedelta
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
|
|
from receiptwitness.scrapers.base import SessionData
|
|
from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper
|
|
from receiptwitness.scrapers.meijer import MeijerScraper
|
|
|
|
|
|
class TestHumanDelayBehavior:
    """Check that ``human_delay`` sleeps for a duration inside the requested bounds.

    Every test swaps ``asyncio.sleep`` (reached through the scraper base module) for an
    ``AsyncMock``, so nothing actually waits and the requested duration can be read
    back from the mock's recorded call.
    """

    # Patch target: ``asyncio.sleep`` as referenced from the scraper base module.
    _SLEEP_TARGET = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self):
        """Explicit 100-200 ms bounds must produce exactly one sleep of 0.1-0.2 s."""
        kroger = KrogerScraper()

        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            await kroger.human_delay(100, 200)

        fake_sleep.assert_called_once()
        # The sleep duration is the first positional argument, in seconds.
        slept_seconds = fake_sleep.call_args.args[0]
        assert 0.1 <= slept_seconds <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self):
        """With no arguments, the bounds come from ``settings.min/max_request_delay_ms``."""
        meijer = MeijerScraper()

        with (
            patch("receiptwitness.scrapers.base.settings") as fake_settings,
            patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep,
        ):
            fake_settings.min_request_delay_ms = 1000
            fake_settings.max_request_delay_ms = 5000
            await meijer.human_delay()

        fake_sleep.assert_called_once()
        slept_seconds = fake_sleep.call_args.args[0]
        assert 1.0 <= slept_seconds <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self):
        """Repeated calls should not all request the same delay (probabilistic)."""
        kroger = KrogerScraper()
        distinct_delays = set()

        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            for _ in range(20):
                await kroger.human_delay(100, 5000)
                # call_args always reflects the most recent call.
                distinct_delays.add(fake_sleep.call_args.args[0])

        # With a 100-5000 ms range, 20 draws should yield at least 2 distinct values.
        assert len(distinct_delays) >= 2
|
|
|
|
|
|
class TestKrogerRateLimiting:
    """Ensure the Kroger scraper paces its receipt fetches with ``human_delay``."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Scraping three listed orders must trigger at least three ``human_delay`` calls."""
        kroger = KrogerScraper()
        session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=2),
        )

        # Listing payload: three orders.
        order_stubs = [
            {
                "orderId": f"KR-{i}",
                "purchaseDate": "2026-03-10T14:00:00Z",
                "storeNumber": "357",
            }
            for i in range(3)
        ]
        listing = AsyncMock(ok=True, json=AsyncMock(return_value={"orders": order_stubs}))
        detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))

        # Responses are handed out in order: the listing first, then three details.
        requester = AsyncMock(get=AsyncMock(side_effect=[listing, detail, detail, detail]))

        # Fake Playwright object graph: playwright -> browser -> context -> page.
        page = AsyncMock(goto=AsyncMock(), request=requester)
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as playwright_factory,
            patch.object(kroger, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            # async_playwright() must return an async context manager yielding playwright.
            entered = AsyncMock()
            entered.__aenter__ = AsyncMock(return_value=playwright)
            entered.__aexit__ = AsyncMock(return_value=False)
            playwright_factory.return_value = entered

            receipts = await kroger.scrape_receipts(session)

        assert len(receipts) == 3
        # Only a lower bound is enforced: at least one delay per receipt. Extra delays
        # (for example around the initial navigation) are allowed but not required.
        assert delay_spy.call_count >= 3
|
|
|
|
|
|
class TestMeijerRateLimiting:
    """Ensure the Meijer scraper paces its receipt fetches with ``human_delay``."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Scraping three listed transactions must trigger at least three delays."""
        meijer = MeijerScraper()
        session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=4),
        )

        # Listing payload: three transactions.
        transaction_stubs = [
            {
                "transactionId": f"TXN-{i}",
                "transactionDate": "2026-03-10T14:00:00Z",
                "storeNumber": "42",
            }
            for i in range(3)
        ]
        listing = AsyncMock(
            ok=True,
            json=AsyncMock(return_value={"transactions": transaction_stubs}),
        )
        detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))

        # Responses are handed out in order: the listing first, then three details.
        requester = AsyncMock(get=AsyncMock(side_effect=[listing, detail, detail, detail]))

        # Fake Playwright object graph: playwright -> browser -> context -> page.
        page = AsyncMock(goto=AsyncMock(), request=requester)
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as playwright_factory,
            patch.object(meijer, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            # async_playwright() must return an async context manager yielding playwright.
            entered = AsyncMock()
            entered.__aenter__ = AsyncMock(return_value=playwright)
            entered.__aexit__ = AsyncMock(return_value=False)
            playwright_factory.return_value = entered

            receipts = await meijer.scrape_receipts(session)

        assert len(receipts) == 3
        # Lower bound only: at least one delay per fetched receipt.
        assert delay_spy.call_count >= 3
|
|
|
|
|
|
class TestGracefulErrorRecovery:
    """Scrapers should not retry endlessly on errors.

    The Playwright mock scaffolding and session construction shared by every test
    live in private helpers (names start with ``_`` so pytest does not collect them).
    """

    @staticmethod
    def _make_session(domain, user_agent, ttl_hours):
        """Build a ``SessionData`` with one cookie for ``domain``, expiring in ``ttl_hours``."""
        return SessionData(
            cookies=[{"name": "s", "value": "v", "domain": domain, "path": "/"}],
            user_agent=user_agent,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=ttl_hours),
        )

    @staticmethod
    def _playwright_cm(mock_request):
        """Return a mock ``async_playwright()`` context manager wired to ``mock_request``.

        The chain is: context manager -> playwright -> ``chromium.launch()`` -> browser
        -> ``new_context()`` -> context -> ``new_page()`` -> page, whose ``request``
        attribute is ``mock_request``.
        """
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request

        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser

        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)

        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)
        return mock_cm

    async def _scrape_kroger_with_failed_listing(self, status, status_text):
        """Run the Kroger scraper against a listing call that always fails.

        Every ``request.get`` call returns the same non-OK response carrying ``status``
        and ``status_text``. Returns ``(receipts, mock_request)`` so the caller can
        check both the scrape result and how many requests were issued.
        """
        scraper = KrogerScraper()
        valid_session = self._make_session(".kroger.com", DEFAULT_USER_AGENT, 2)

        mock_api_response = AsyncMock()
        mock_api_response.ok = False
        mock_api_response.status = status
        mock_api_response.status_text = status_text

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(return_value=mock_api_response)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._playwright_cm(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)

        return receipts, mock_request

    @pytest.mark.asyncio
    async def test_kroger_api_500_returns_empty_not_retry(self):
        """500 error should return empty list, not retry."""
        receipts, mock_request = await self._scrape_kroger_with_failed_listing(
            500, "Internal Server Error"
        )

        assert receipts == []
        # Should only call the API once — no retries
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_kroger_429_returns_empty_not_retry(self):
        """Rate limit (429) should return empty, not retry."""
        receipts, mock_request = await self._scrape_kroger_with_failed_listing(
            429, "Too Many Requests"
        )

        assert receipts == []
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_meijer_detail_exception_continues(self):
        """A failed detail fetch for one receipt should not abort remaining receipts.

        The failure is simulated as a non-OK (HTTP 500) detail response, not as a
        raised exception.
        """
        scraper = MeijerScraper()
        valid_session = self._make_session(".meijer.com", "test", 4)

        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": "TXN-1",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-2",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )

        # First detail call returns a non-OK (500) response, second succeeds
        mock_detail_fail = AsyncMock()
        mock_detail_fail.ok = False
        mock_detail_fail.status = 500

        mock_detail_ok = AsyncMock()
        mock_detail_ok.ok = True
        mock_detail_ok.json = AsyncMock(return_value={"items": []})

        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response, mock_detail_fail, mock_detail_ok]
        )

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._playwright_cm(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)

        # Both receipts should be returned — the first with empty detail
        assert len(receipts) == 2
        assert receipts[0].raw_data.get("detail") == {}
        assert receipts[1].receipt_id == "TXN-2"
|