Skip to content

Schema

BaseSchema

The base schema class provides the parsing magic. It accepts str, bytes, Selector, and SelectorList objects.

from scrape_schema import BaseSchema, Sc, Parsel
import requests


class Schema(BaseSchema):
    """Example schema whose three fields are filled from XPath queries."""

    # single value selected by the XPath `//title/text()`
    title: Sc[str, Parsel().xpath("//title/text()").get()]
    # single value selected by the XPath `//body/text()`
    body: Sc[str, Parsel().xpath("//body/text()").get()]
    # all values selected by `//a/@href`; `default=[]` is handed to Parsel
    # (presumably the fallback when nothing matches -- confirm in the library docs)
    urls: Sc[list[str], Parsel(default=[]).xpath("//a/@href").getall()]


if __name__ == '__main__':
    # Pass an explicit timeout: requests applies none by default, so without it
    # the example could block forever on an unresponsive host.
    response = requests.get("https://example.com", timeout=10)

    # from string
    print(Schema(response.text).dict())

    # from bytes
    print(Schema(response.content).dict())

    # or from parsel.Selector object
    from parsel import Selector
    selector = Selector(response.text)
    print(Schema(selector).dict())

For other input types, such as Response objects (requests, httpx, aiohttp, etc.), convert the value to str, bytes, or a parsel.Selector object, or write an adapter or a custom constructor:

import requests

from scrape_schema import BaseSchema, Sc, Parsel


class Schema(BaseSchema):
    """Example schema with an alternate constructor for requests responses."""

    # single value selected by the XPath `//title/text()`
    title: Sc[str, Parsel().xpath("//title/text()").get()]
    # single value selected by the XPath `//body/text()`
    body: Sc[str, Parsel().xpath("//body/text()").get()]
    # all values selected by `//a/@href`; `default=[]` is handed to Parsel
    # (presumably the fallback when nothing matches -- confirm in the library docs)
    urls: Sc[list[str], Parsel(default=[]).xpath("//a/@href").getall()]

    @classmethod
    def from_request(cls, response: requests.Response):
        """Build the schema from the text body of a ``requests.Response``."""
        return cls(response.text)


if __name__ == '__main__':
    # Pass an explicit timeout: requests applies none by default, so without it
    # the example could block forever on an unresponsive host.
    resp = requests.get("https://example.com", timeout=10)
    print(Schema.from_request(resp))

Configuration

If you need to pass additional arguments to the Selector constructor (for example, to parse XML), import the SchemaConfig class and add it to the schema:

from scrape_schema import BaseSchema
from scrape_schema.base import SchemaConfig


class XMLConfig(SchemaConfig):
    """Config whose kwargs are added automatically to new parsel.Selector instances."""

    selector_kwargs = {"type": "xml"}


class Schema(BaseSchema):
    """Schema whose nested ``Config`` inherits everything from ``XMLConfig``."""

    class Config(XMLConfig):
        # nothing is overridden here; XMLConfig's attributes are inherited as-is
        pass
    # do something (e.g. declare the schema fields here)

sc_param

A property descriptor whose value is included in the dict() output. Useful for additional conversion or for reusing a value from a field.

import pprint
from typing import Literal

from parsel import Selector
from scrape_schema import BaseSchema, Sc, Parsel, sc_param


class Schema(BaseSchema):
    """Example schema mixing parsed fields with computed ``sc_param`` values."""

    # single value selected by the XPath `//a/@href`
    url_path: Sc[str, Parsel().xpath("//a/@href").get()]
    # raw Selector for the first `//div` match; it does not appear in the
    # dict() output shown at the bottom of this example
    _raw_tag: Sc[Selector, Parsel(auto_type=False).xpath("//div")[0]]

    @sc_param
    def div(self) -> dict[str, str]:
        """Return the ``class`` attribute and the text of the raw ``<div>``."""
        return {"class": self._raw_tag.attrib.get('class'),
                "text": self._raw_tag.xpath(".//text()").get()}

    @sc_param
    def url(self) -> str:
        """Return the absolute URL built from ``url_path``.

        Leading slashes are stripped from the path before it is appended, so
        an href such as ``/image.png`` no longer yields a double slash after
        the host.
        """
        return f"https://example.com/{self.url_path.lstrip('/')}"

    @sc_param
    def foo(self) -> Literal["foo"]:
        """Return a constant, showing that ``sc_param`` needs no field input."""
        return "foo"

    # this property will not be returned from the dict() method
    @property
    def bar(self) -> str:
        return "bar"


text = """<a href="/image.png">
<div class="example">hello, scrape schema!</div>
"""
pprint.pprint(Schema(text).dict(), compact=True)
# {'div': {'class': 'example', 'text': 'hello, scrape schema!'},
# 'foo': 'foo',
# 'url': 'https://example.com/image.png',
# 'url_path': '/image.png'}