Skip to content

Code comparison

html

parsel:

from parsel import Selector
import pprint
import requests


def original_parsel(resp: str):
    sel = Selector(resp)
    __RATINGS = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
    data: dict[str, list[dict]] = {"books": []}
    for book_sel in sel.xpath(".//section/div/ol[@class='row']/li"):
        if url := book_sel.xpath('//div[@class="image_container"]/a/@href').get():
            url = f"https://books.toscrape.com/catalogue/{url}"
        if image := book_sel.xpath('//div[@class="image_container"]/a/img/@src').get():
            image = f"https://books.toscrape.com{image[2:]}"
        if price := book_sel.xpath('//div[@class="product_price"]/p[@class="price_color"]/text()').get():
            price = float(price[2:])
        else:
            price = .0
        name = book_sel.xpath("//h3/a/@title").get()
        available = book_sel.xpath('//div[@class="product_price"]/p[@class="instock availability"]/i').attrib.get('class')
        available = ('icon-ok' in available)
        rating = book_sel.xpath('//p[contains(@class, "star-rating")]').attrib.get('class')
        rating = __RATINGS.get(rating.split()[-1], 0)
        data['books'].append(dict(url=url, image=image, price=price, name=name, available=available, rating=rating))
    return data


if __name__ == '__main__':
    response = requests.get("https://books.toscrape.com/catalogue/page-2.html").text
    pprint.pprint(original_parsel(response), compact=True)

scrape_schema:

from typing import List
import pprint
import requests
from scrape_schema import BaseSchema, Sc, Nested, sc_param, Parsel


class Book(BaseSchema):
    __RATINGS = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
    url: Sc[str, (Parsel()
                  .xpath('//div[@class="image_container"]/a/@href')
                  .get()
                  .concat_l("https://books.toscrape.com/catalogue/"))]
    image: Sc[str, (Parsel()
                    .xpath('//div[@class="image_container"]/a/img/@src')
                    .get()[2:]
                    .concat_l("https://books.toscrape.com"))]
    price: Sc[float, (Parsel(default=.0)
                      .xpath('//div[@class="product_price"]/p[@class="price_color"]/text()')
                      .get()[2:])]
    name: Sc[str, Parsel().xpath("//h3/a/@title").get()]
    available: Sc[bool, (Parsel()
                         .xpath('//div[@class="product_price"]/p[@class="instock availability"]/i')
                         .attrib['class']
                         .fn(lambda s: s == 'icon-ok')  # check available tag
                         )]
    _rating: Sc[str, Parsel().xpath('//p[contains(@class, "star-rating")]').attrib.get(key='class')]

    @sc_param
    def rating(self) -> int:
        return self.__RATINGS.get(self._rating.split()[-1], 0)


class MainPage(BaseSchema):
    books: Sc[List[Book], Nested(Parsel().xpath(".//section/div/ol[@class='row']/li").getall())]


if __name__ == '__main__':
    response = requests.get("https://books.toscrape.com/catalogue/page-2.html").text
    pprint.pprint(MainPage(response).dict(), compact=True)

raw text

original re:

import re
import pprint

TEXT = """
banana potato BANANA POTATO
-foo:10
-bar:20
lorem upsum dolor
192.168.0.1
"""


def parse_text(text: str) -> dict:
    if match := re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})", text):
        ipv4 = match[1]
    else:
        ipv4 = None

    if matches := re.findall(r"(\d+)", text):
        max_digit = max(int(i) for i in matches)
    else:
        max_digit = None

    failed_value = bool(re.search(r"(ora)", text))

    if matches := re.findall(r"(\d+)", text):
        digits = [int(i) for i in matches]
        digits_float = [float(f'{i}.5') for i in matches]
    else:
        digits = None
        digits_float = None
    words_lower = matches if (matches := re.findall(r"([a-z]+)", text)) else None
    words_upper = matches if (matches := re.findall(r"([A-Z]+)", text)) else None

    return dict(ipv4=ipv4, max_digit=max_digit, failed_value=failed_value,
                digits=digits, digits_float=digits_float,
                words_lower=words_lower, words_upper=words_upper)


if __name__ == '__main__':
    pprint.pprint(parse_text(TEXT), width=48, compact=True)
    # {'digits': [10, 20, 192, 168, 0, 1],
    #  'digits_float': [10.5, 20.5, 192.5, 168.5, 0.5,
    #                   1.5],
    #  'failed_value': False,
    #  'ip_v4': '192.168.0.1',
    #  'max_digit': 192,
    #  'words_lower': ['banana', 'potato', 'foo',
    #                  'bar', 'lorem', 'upsum',
    #                  'dolor'],
    #  'words_upper': ['BANANA', 'POTATO']}

scrape_schema:

from typing import List  # if you usage python3.8. If python3.9 - use build-in list
import pprint
from scrape_schema import Text, BaseSchema, Sc, sc_param

# Note: `Sc` is shortcut typing.Annotated

TEXT = """
banana potato BANANA POTATO
-foo:10
-bar:20
lorem upsum dolor
192.168.0.1
"""


class MySchema(BaseSchema):
    ipv4: Sc[str, Text().re_search(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})")[1]]
    failed_value: Sc[bool, Text(default=False).re_search(r"(ora)")[1]]
    digits: Sc[List[int], Text().re_findall(r"(\d+)")]
    digits_float: Sc[List[float], Text().re_findall(r"(\d+)").fn(lambda lst: [f"{s}.5" for s in lst])]
    words_lower: Sc[List[str], Text().re_findall("([a-z]+)")]
    words_upper: Sc[List[str], Text().re_findall(r"([A-Z]+)")]

    @sc_param
    def sum(self):
        return sum(self.digits)

    @sc_param
    def max_digit(self):
        return max(self.digits)

    @sc_param
    def all_words(self):
        return self.words_lower + self.words_upper


if __name__ == '__main__':
    pprint.pprint(MySchema(TEXT).dict(), compact=True)
# {'all_words': ['banana', 'potato', 'foo', 'bar', 'lorem', 'upsum', 'dolor',
#                'BANANA', 'POTATO'],
#  'digits': [10, 20, 192, 168, 0, 1],
#  'digits_float': [10.5, 20.5, 192.5, 168.5, 0.5, 1.5],
#  'failed_value': False,
#  'ipv4': '192.168.0.1',
#  'max_digit': 192,
#  'sum': 391,
#  'words_lower': ['banana', 'potato', 'foo', 'bar', 'lorem', 'upsum', 'dolor'],
#  'words_upper': ['BANANA', 'POTATO']}