Quickstart
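The field interface is similar to the original parsel API. The examples below extract the same values from the same HTML document three ways: with a schema declared through `Sc[...]` annotations, with a schema declared through plain type annotations and field defaults, and with parsel's own `Selector` for comparison.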
from scrape_schema import BaseSchema, Parsel, Sc
class Schema(BaseSchema):
    """Schema for the quickstart HTML sample, declared with ``Sc`` annotations.

    Every field is annotated as ``Sc[<result type>, <Parsel field>]``: the
    first item is the type of the extracted value, the second is the chain of
    selector calls that produces it.
    """

    # First match of the CSS selector ``h1::text``.
    h1: Sc[str, Parsel().css('h1::text').get()]

    # Every ``\w+`` regex match in the <h1> text nodes.
    words: Sc[list[str], Parsel().xpath('//h1/text()').re(r'\w+')]

    # All ``href`` attribute values found below ``ul > li``.
    urls: Sc[list[str], Parsel().css('ul > li').xpath('.//@href').getall()]

    # First result of the JMESPath query ``a`` over the <script> text.
    sample_jmespath_1: Sc[
        str,
        Parsel().css('script::text').jmespath("a").get()
    ]

    # All results of the JMESPath query ``a`` over the <script> text.
    sample_jmespath_2: Sc[
        list[str],
        Parsel().css('script::text').jmespath("a").getall()
    ]
# Sample HTML document the schema is applied to.
text = """
<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
<script type="application/json">{"a": ["b", "c"]}</script>
</body>
</html>"""

# Build the schema from the markup, then print its fields as a dict.
schema = Schema(text)
print(schema.dict())
# Expected output (wrapped here for readability):
# {'h1': 'Hello, Parsel!',
# 'words': ['Hello', 'Parsel'],
# 'urls': ['http://example.com', 'http://scrapy.org'],
# 'sample_jmespath_1': 'b',
# 'sample_jmespath_2': ['b', 'c']}
# mypy: disable-error-code="assignment"
from scrape_schema import BaseSchema, Parsel
class Schema(BaseSchema):
    """Schema for the quickstart HTML sample, declared with field defaults.

    Every field carries a plain type annotation for the extracted value and
    is assigned the chain of ``Parsel()`` selector calls that produces it.
    """

    # NOTE(review): the file-level ``# mypy: disable-error-code="assignment"``
    # directive is presumably needed because the values assigned below are
    # Parsel field objects, not instances of the annotated types -- confirm.

    # First match of the CSS selector ``h1::text``.
    h1: str = Parsel().css('h1::text').get()

    # Every ``\w+`` regex match in the <h1> text nodes.
    words: list[str] = Parsel().xpath('//h1/text()').re(r'\w+')

    # All ``href`` attribute values found below ``ul > li``.
    urls: list[str] = Parsel().css('ul > li').xpath('.//@href').getall()

    # First result of the JMESPath query ``a`` over the <script> text.
    sample_jmespath_1: str = (
        Parsel().css('script::text').jmespath("a").get()
    )

    # All results of the JMESPath query ``a`` over the <script> text.
    sample_jmespath_2: list[str] = (
        Parsel().css('script::text').jmespath("a").getall()
    )
# Sample HTML document the schema is applied to.
text = """
<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
<script type="application/json">{"a": ["b", "c"]}</script>
</body>
</html>"""

# Build the schema from the markup, then print its fields as a dict.
schema = Schema(text)
print(schema.dict())
# Expected output (wrapped here for readability):
# {'h1': 'Hello, Parsel!',
# 'words': ['Hello', 'Parsel'],
# 'urls': ['http://example.com', 'http://scrapy.org'],
# 'sample_jmespath_1': 'b',
# 'sample_jmespath_2': ['b', 'c']}
from parsel import Selector
# Sample HTML document queried below with parsel's own Selector.
text = """
<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
<script type="application/json">{"a": ["b", "c"]}</script>
</body>
</html>"""

selector = Selector(text=text)

# Text of the <h1> element.
# Prints: Hello, Parsel!
print(selector.css('h1::text').get())

# Regex matches of \w+ in the <h1> text.
# Prints: ['Hello', 'Parsel']
print(selector.xpath('//h1/text()').re(r'\w+'))

# One href per <li>, printed on separate lines.
# Prints:
# http://example.com
# http://scrapy.org
for li in selector.css('ul > li'):
    print(li.xpath('.//@href').get())

# First result of the JMESPath query "a" over the <script> text.
# Prints: b
print(selector.css('script::text').jmespath("a").get())

# All results of the same JMESPath query.
# Prints: ['b', 'c']
print(selector.css('script::text').jmespath("a").getall())