Quick start

So you want to create a new scraper. First, make sure you have installed Sneakpeek:

pip install sneakpeek-py
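
To double-check that the installation succeeded, you can ask pip to print the package metadata:

pip show sneakpeek-py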

The next step is to implement the scraper logic (the so-called scraper handler):

# file: demo_scraper.py

import json
import logging

from pydantic import BaseModel

from sneakpeek.scraper.model import ScraperContextABC, ScraperHandler


# This defines the model of the handler parameters that are
# declared in the scraper config and then passed to the handler
class DemoScraperParams(BaseModel):
    url: str

# This is the class that actually implements the scraper logic.
# Note that the implementation must inherit from
# `sneakpeek.scraper.model.ScraperHandler`
class DemoScraper(ScraperHandler):
    # You can have any dependencies you want and pass them
    # in the server configuration
    def __init__(self) -> None:
        self._logger = logging.getLogger(__name__)

    # Each handler must define its name so that it can later
    # be referenced in the scraper configuration
    @property
    def name(self) -> str:
        return "demo_scraper"

    # Some example function that processes the response
    # and extracts valuable information. It is left as a stub
    # that returns an empty list so that `run` below has
    # something to count
    async def process_page(self, response: str) -> list:
        return []

    # This function is called by the worker to execute the logic.
    # The only argument passed is `sneakpeek.scraper.model.ScraperContextABC`,
    # which implements a basic async HTTP client and also provides the
    # parameters that are defined in the scraper config
    async def run(self, context: ScraperContextABC) -> str:
        params = DemoScraperParams.parse_obj(context.params)
        # Perform GET request to the URL defined in the scraper config
        response = await context.get(params.url)
        response_body = await response.text()

        # Perform some business logic on a response
        result = await self.process_page(response_body)

        # Return a meaningful job summary - it must be a string
        return json.dumps({
            "processed_urls": 1,
            "found_results": len(result),
        })
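
Before wiring the handler into a server, it can be handy to smoke-test it in isolation. The sketch below is not part of Sneakpeek: `FakeContext` and `FakeResponse` are hypothetical stand-ins that fake the only two context members the handler touches (`params` and `get`):

# file: try_demo_scraper.py (illustrative only, not part of Sneakpeek)

import asyncio

from demo_scraper import DemoScraper


class FakeResponse:
    # Stands in for the HTTP response; the handler only calls `text()`
    async def text(self) -> str:
        return "<html><body>hello</body></html>"


class FakeContext:
    # Stands in for `ScraperContextABC`; only the two members
    # used by `DemoScraper.run` are stubbed
    params = {"url": "http://example.com"}

    async def get(self, url: str) -> FakeResponse:
        return FakeResponse()


# Prints the JSON summary returned by DemoScraper.run
print(asyncio.run(DemoScraper().run(FakeContext())))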

Now that we have some scraper logic, let’s make it run periodically. To do so, let’s configure the SneakpeekServer:

# file: main.py

from uuid import uuid4

from demo_scraper import DemoScraper
from sneakpeek.logging import configure_logging
from sneakpeek.middleware.parser import ParserMiddleware
from sneakpeek.middleware.rate_limiter_middleware import (
    RateLimiterMiddleware,
    RateLimiterMiddlewareConfig,
)
from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware
from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddleware
from sneakpeek.middleware.user_agent_injecter_middleware import (
    UserAgentInjecterMiddleware,
    UserAgentInjecterMiddlewareConfig,
)
from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage
from sneakpeek.queue.model import TaskPriority
from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage
from sneakpeek.scheduler.model import TaskSchedule
from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage
from sneakpeek.scraper.model import Scraper, ScraperConfig
from sneakpeek.server import SneakpeekServer


def get_server() -> SneakpeekServer:
    handler = DemoScraper()
    return SneakpeekServer.create(
        handlers=[handler],
        scraper_storage=InMemoryScraperStorage([
            Scraper(
                id=str(uuid4()),
                name=f"Demo Scraper",
                schedule=TaskSchedule.EVERY_MINUTE,
                handler=handler.name,
                config=ScraperConfig(params={"url": "http://example.com"}),
                schedule_priority=TaskPriority.NORMAL,
            )
        ]),
        queue_storage=InMemoryQueueStorage(),
        lease_storage=InMemoryLeaseStorage(),
        middlewares=[
            # Logs all outgoing requests and their responses
            RequestsLoggingMiddleware(),
            # Respects the robots.txt of the scraped websites
            RobotsTxtMiddleware(),
            # Caps the request rate at 60 requests per minute
            RateLimiterMiddleware(RateLimiterMiddlewareConfig(max_rpm=60)),
            # Automatically adds a User-Agent header to requests
            UserAgentInjecterMiddleware(
                UserAgentInjecterMiddlewareConfig(use_external_data=False)
            ),
            # Provides HTML parsing helpers
            ParserMiddleware(),
        ],
    )


def main():
    configure_logging()
    server = get_server()
    server.serve()


if __name__ == "__main__":
    main()

Now the only thing left is to actually run the server:

python3 main.py

That’s it! Now you can open http://localhost:8080 and explore the UI to see how your scraper is being automatically scheduled and executed.