Skip to content

Tools

Codegen

Generate pydantic models

Example: generating pydantic models from a schema

sc_schema.py

from typing import Optional

from scrape_schema import BaseSchema, Parsel, sc_param, Nested
from scrape_schema.codegen import generate_pydantic_schema

# Nested schema referenced by the `item` and `items` fields of `Schema` below.
# In the generated models.py it becomes `class Item(BaseModel)` with the same
# three field names.
class Item(BaseSchema):
    # Field built from the `//a` XPath query followed by `.get()`; typed as str.
    foo: str = Parsel().xpath("//a").get()
    # Field built from the `//ul/li` XPath query followed by `.get()`; typed as float.
    bar: float = Parsel().xpath("//ul/li").get()

    # Method-based field: always returns the fixed list [1, 2, 3].
    # The generated model shows it as `spammers: list[int]`.
    @sc_param
    def spammers(self) -> list[int]:
        return [1, 2, 3]

# Entry-point schema handed to generate_pydantic_schema() at the bottom of this
# file. Each field uses a different Parsel/Nested option so the result can be
# compared line by line with the generated models.py.
class Schema(BaseSchema):
    # Plain field -> generated as `h: str`
    h: str = Parsel().xpath("//h1/text()").get()
    # default="0" -> generated as `a: str = '0'`
    a: str = Parsel(default="0").xpath("").get()
    # alias only -> generated as `b: int = Field(alias='alias')`
    b: int = Parsel(alias="alias").xpath("a").get()
    # alias plus default -> generated as `c: str = Field(default='def', alias='alias2')`
    c: str = Parsel(alias="alias2", default="def")
    # Single nested schema -> generated as `item: Item`
    item: Item = Nested(Parsel().xpath("//div"))
    # List of nested schemas -> generated as `items: list[Item]`
    items: list[Item] = Nested(Parsel().xpath("//div"))
    # Optional annotation -> generated as `opt: Optional[int]`
    opt: Optional[int] = Parsel().get()

    # Method-based field without a return annotation; the generated model
    # shows it as `spam: Any`.
    @sc_param
    def spam(self):
        return "spam"

    # Method-based field annotated `-> int`; the generated model shows `egg: int`.
    @sc_param
    def egg(self) -> int:
        return 1

if __name__ == '__main__':
    # Pass the main (entry-point) schema to the generator, then print the
    # pydantic source it produced.
    pydantic_source = generate_pydantic_schema(Schema)
    print(pydantic_source)

Write the printed output to a new file:

models.py

from typing import Any, Optional
from pydantic import BaseModel, Field


# Generated pydantic counterpart of the `Item` schema from sc_schema.py.
class Item(BaseModel):
    foo: str
    bar: float
    # Corresponds to the `spammers` sc_param method (annotated `-> list[int]`).
    spammers: list[int]


# Generated pydantic counterpart of the `Schema` schema from sc_schema.py.
class Schema(BaseModel):
    h: str
    # From `Parsel(default="0")`: the default is emitted as a plain value.
    a: str = '0'
    # From `Parsel(alias="alias")`: the alias is emitted through Field().
    b: int = Field(alias='alias')
    # From `Parsel(alias="alias2", default="def")`: both go through Field().
    c: str = Field(default='def', alias='alias2')
    # From the two `Nested(...)` fields: a single Item and a list of Items.
    item: Item
    items: list[Item]
    opt: Optional[int]
    # From the `spam` sc_param method, which has no return annotation.
    spam: Any
    # From the `egg` sc_param method, annotated `-> int`.
    egg: int

Example usage:

from sc_schema import Schema
from models import Schema as PdSchema

if __name__ == '__main__':
    # any valid markup
    markup = ""
    # Parse with the scrape-schema class, then feed the resulting dict into
    # the generated pydantic model as keyword arguments.
    parsed_fields = Schema(markup).dict()
    validated = PdSchema(**parsed_fields)
    print(validated)

Generate python code

Warning

This tool is under development; new features will be added later.

If you do not want to use this library in your projects, you can instead generate parser code from a schema. The generated code (shown under "Output" below) does not import scrape_schema:

from scrape_schema import BaseSchema, Parsel, Sc
from scrape_schema.codegen import generate_code


# Schema declared with `Sc[<field type>, <Parsel call chain>]` annotations.
# The generate_code() output shown below contains one `__parse_<field>` method
# per field declared here, replaying the same call chain step by step.
class Schema(BaseSchema):
    # CSS query followed by `.get()`
    text: Sc[str, Parsel().css("h1::text").get()]
    # XPath query followed by a regex; emitted as `.re(r'\w+', True)`
    words: Sc[list[str], Parsel().xpath("//h1/text()").re(r"\w+")]
    # CSS query, then a relative XPath query, then `.getall()`
    urls: Sc[list[str], Parsel().css("ul > li").xpath(".//@href").getall()]
    # CSS query on script text, then a JMESPath query, then `.get()`
    sample_jmespath_1: Sc[str, Parsel().css("script::text").jmespath("a").get()]
    # Same chain as above, ending in `.getall()`
    sample_jmespath_2: Sc[
        list[str], Parsel().css("script::text").jmespath("a").getall()
    ]


# Render the schema as parser source code and print it.
generated_source = generate_code(Schema)
print(generated_source)

Output:

# Created by scrape-schema codegen
#
# WARNING: Any manual changes made to this file will be lost when generator is
# run again.  Do not edit this file unless you know what you are doing.

from typing import Dict, Union, Any
import re

from parsel import Selector, SelectorList
import chompjs


# Parser class emitted by generate_code() for the `Schema` declared above.
# It has one private `__parse_<field>` method per schema field, and `parse()`
# runs all of them against the given markup.
class Schema:
    def __init__(self):
        # Field name -> most recently parsed value; filled by the
        # `__parse_*` methods and exposed through the `cache` property.
        self.__scope: Dict[str, Any] = {}

    # Normalise the accepted input types to a Selector/SelectorList:
    # str is wrapped with Selector(markup), bytes with Selector(body=markup),
    # existing Selector/SelectorList objects are returned unchanged.
    # Any other type raises TypeError.
    @staticmethod
    def _prepare_markup(markup: Union[str, bytes, Selector, SelectorList]):
        if isinstance(markup, str):
            return Selector(markup)
        elif isinstance(markup, bytes):
            return Selector(body=markup)
        elif isinstance(markup, (Selector, SelectorList)):
            return markup
        msg = (f'markup should be [str, bytes, Selector, SelectorList], '
               f'not {type(markup).__name__}')
        raise TypeError(msg)

    # Public entry point: normalise the markup once, run every field parser
    # on it and return the results as a dict keyed by field name.
    def parse(self, markup: Union[str, bytes, Selector, SelectorList]) -> Dict[str, Any]:
        markup = self._prepare_markup(markup)
        return {
            'text': self.__parse_text(markup),
            'words': self.__parse_words(markup),
            'urls': self.__parse_urls(markup),
            'sample_jmespath_1': self.__parse_sample_jmespath_1(markup),
            'sample_jmespath_2': self.__parse_sample_jmespath_2(markup),
            }

    @property
    def cache(self) -> Dict[str, Any]:
        """get last parsed data"""
        return self.__scope

    # All `__parse_*` methods below share one shape: replay the field's call
    # chain one step at a time inside a try block, store the result in the
    # scope dict under the field name, and return it. `default = ...`
    # (Ellipsis) is the marker checked in the except branch; because every
    # method here sets default to Ellipsis, any exception is re-raised and
    # the `result = default` fallback is never reached.
    def __parse_text(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('h1::text').get() signature"""
        name = 'text'
        default = ...
        try:
            result = markup.css('h1::text')
            result = result.get()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    # Field `words`: XPath query, then `.re()` with the pattern r'\w+'.
    def __parse_words(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).xpath('//h1/text()').re('\\w+', True) signature"""
        name = 'words'
        default = ...
        try:
            result = markup.xpath('//h1/text()')
            result = result.re(r'\w+', True)
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    # Field `urls`: CSS query, then a relative XPath query, then `.getall()`.
    def __parse_urls(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('ul > li').xpath('.//@href').getall() signature"""
        name = 'urls'
        default = ...
        try:
            result = markup.css('ul > li')
            result = result.xpath('.//@href')
            result = result.getall()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    # Field `sample_jmespath_1`: CSS query, JMESPath query 'a', then `.get()`.
    def __parse_sample_jmespath_1(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('script::text').jmespath('a').get() signature"""
        name = 'sample_jmespath_1'
        default = ...
        try:
            result = markup.css('script::text')
            result = result.jmespath('a')
            result = result.get()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    # Field `sample_jmespath_2`: same chain as sample_jmespath_1, ending in
    # `.getall()` instead of `.get()`.
    def __parse_sample_jmespath_2(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('script::text').jmespath('a').getall() signature"""
        name = 'sample_jmespath_2'
        default = ...
        try:
            result = markup.css('script::text')
            result = result.jmespath('a')
            result = result.getall()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

Roadmap

  • [x] base codegen
  • [x] dict serialize
  • [x] pydantic serialize
  • [ ] attrs serialize
  • [ ] dataclass serialize
  • [ ] codegen output optimizations