Tools
Codegen
Generate pydantic models
Example: generating pydantic models
sc_schema.py
from typing import Optional

from scrape_schema import BaseSchema, Parsel, sc_param, Nested
from scrape_schema.codegen import generate_pydantic_schema


class Item(BaseSchema):
    # Nested schema: used by Schema.item and Schema.items below.
    foo: str = Parsel().xpath("//a").get()
    bar: float = Parsel().xpath("//ul/li").get()

    @sc_param
    def spammers(self) -> list[int]:
        # Computed field that always yields the same list.
        return [1, 2, 3]


class Schema(BaseSchema):
    # Main schema: this is the class handed to generate_pydantic_schema().
    h: str = Parsel().xpath("//h1/text()").get()
    # Field declared with a default value.
    a: str = Parsel(default="0").xpath("").get()
    # Field declared with an alias.
    b: int = Parsel(alias="alias").xpath("a").get()
    # Field declared with both an alias and a default.
    c: str = Parsel(alias="alias2", default="def")
    # A single nested Item and a list of nested Items.
    item: Item = Nested(Parsel().xpath("//div"))
    items: list[Item] = Nested(Parsel().xpath("//div"))
    opt: Optional[int] = Parsel().get()

    @sc_param
    def spam(self):
        # Computed field without a return annotation.
        return "spam"

    @sc_param
    def egg(self) -> int:
        # Computed field with an annotated return type.
        return 1


if __name__ == '__main__':
    # pass main schema entrypoint
    print(generate_pydantic_schema(Schema))
Write the output to a new file:
models.py
# models.py: pydantic models printed by generate_pydantic_schema(Schema)
# for the schemas declared in sc_schema.py.
from typing import Any, Optional

from pydantic import BaseModel, Field


# Counterpart of the nested Item schema.
class Item(BaseModel):
    foo: str
    bar: float
    spammers: list[int]


# Counterpart of the main Schema class.
class Schema(BaseModel):
    h: str
    # Corresponds to Parsel(default="0") in sc_schema.py.
    a: str = '0'
    # Corresponds to Parsel(alias="alias") in sc_schema.py.
    b: int = Field(alias='alias')
    # Corresponds to Parsel(alias="alias2", default="def") in sc_schema.py.
    c: str = Field(default='def', alias='alias2')
    item: Item
    items: list[Item]
    opt: Optional[int]
    # `spam` has no return annotation in sc_schema.py and is typed Any here.
    spam: Any
    egg: int
Example usage:
from sc_schema import Schema
from models import Schema as PdSchema
if __name__ == '__main__':
    # Any valid markup can be used here.
    markup = ""
    # Parse with the scrape-schema class first, then load the resulting dict
    # into the generated pydantic model.
    parsed = Schema(markup).dict()
    print(PdSchema(**parsed))
Generate python code
Warning
This tool is under development; new features will be added later.
If, for some reason, you do not want to use this library in your projects, you can generate plain Python code from a schema instead:
from scrape_schema import BaseSchema, Parsel, Sc
from scrape_schema.codegen import generate_code
class Schema(BaseSchema):
    # Every field is annotated as Sc[<result type>, <Parsel query>].
    text: Sc[str, Parsel().css("h1::text").get()]
    # Query ending in .re(): a regex applied to the selected text.
    words: Sc[list[str], Parsel().xpath("//h1/text()").re(r"\w+")]
    # CSS and XPath steps chained in a single query.
    urls: Sc[list[str], Parsel().css("ul > li").xpath(".//@href").getall()]
    # Queries with a jmespath step, ending in .get() and .getall() respectively.
    sample_jmespath_1: Sc[str, Parsel().css("script::text").jmespath("a").get()]
    sample_jmespath_2: Sc[
        list[str], Parsel().css("script::text").jmespath("a").getall()
    ]


# Print the generated source code for the schema.
print(generate_code(Schema))
Output:
# Created by scrape-schema codegen
#
# WARNING: Any manual changes made to this file will be lost when generator is
# run again. Do not edit this file unless you know what you are doing.
from typing import Dict, Union, Any
import re
from parsel import Selector, SelectorList
import chompjs
class Schema:
    # Parser class printed by generate_code(Schema): parse() calls one private
    # __parse_<field> method per field and returns a dict keyed by field name.
    def __init__(self):
        # Most recent result of every __parse_<field> call; read via `cache`.
        self.__scope: Dict[str, Any] = {}

    @staticmethod
    def _prepare_markup(markup: Union[str, bytes, Selector, SelectorList]):
        # Wrap str/bytes input in a Selector; Selector and SelectorList objects
        # are returned unchanged. Any other type raises TypeError.
        if isinstance(markup, str):
            return Selector(markup)
        elif isinstance(markup, bytes):
            return Selector(body=markup)
        elif isinstance(markup, (Selector, SelectorList)):
            return markup
        msg = (f'markup should be [str, bytes, Selector, SelectorList], '
               f'not {type(markup).__name__}')
        raise TypeError(msg)

    def parse(self, markup: Union[str, bytes, Selector, SelectorList]) -> Dict[str, Any]:
        # Normalize the input once, then run every field parser on the result.
        markup = self._prepare_markup(markup)
        return {
            'text': self.__parse_text(markup),
            'words': self.__parse_words(markup),
            'urls': self.__parse_urls(markup),
            'sample_jmespath_1': self.__parse_sample_jmespath_1(markup),
            'sample_jmespath_2': self.__parse_sample_jmespath_2(markup),
        }

    @property
    def cache(self) -> Dict[str, Any]:
        """get last parsed data"""
        # Returns the internal dict itself, not a copy.
        return self.__scope

    def __parse_text(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('h1::text').get() signature"""
        name = 'text'
        # Ellipsis means "no default": a failing query re-raises its exception.
        default = ...
        try:
            # One statement per step of the original query chain.
            result = markup.css('h1::text')
            result = result.get()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        # Remember the value so it can be read back through `cache`.
        self.__scope[name] = result
        return result

    def __parse_words(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).xpath('//h1/text()').re('\\w+', True) signature"""
        # Same try/default/store pattern as __parse_text.
        name = 'words'
        default = ...
        try:
            result = markup.xpath('//h1/text()')
            result = result.re(r'\w+', True)
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    def __parse_urls(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('ul > li').xpath('.//@href').getall() signature"""
        # Same try/default/store pattern as __parse_text.
        name = 'urls'
        default = ...
        try:
            result = markup.css('ul > li')
            result = result.xpath('.//@href')
            result = result.getall()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    def __parse_sample_jmespath_1(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('script::text').jmespath('a').get() signature"""
        # Same try/default/store pattern as __parse_text.
        name = 'sample_jmespath_1'
        default = ...
        try:
            result = markup.css('script::text')
            result = result.jmespath('a')
            result = result.get()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result

    def __parse_sample_jmespath_2(self, markup: Union[Selector, SelectorList]) -> Any:
        """Parsel(auto_type=True, default=Ellipsis, alias=None).css('script::text').jmespath('a').getall() signature"""
        # Same try/default/store pattern as __parse_text.
        name = 'sample_jmespath_2'
        default = ...
        try:
            result = markup.css('script::text')
            result = result.jmespath('a')
            result = result.getall()
        except Exception as exc:
            if default is Ellipsis:
                raise exc
            result = default
        self.__scope[name] = result
        return result
Roadmap
- [x] base codegen
- [x] dict serialize
- [x] pydantic serialize
- [ ] attrs serialize
- [ ] dataclass serialize
- [ ] codegen output optimizations