Skip to content

Fields

Bases: property

Shortcut for adding property-like descriptors in BaseSchema, which will go into the output of the dict() method.

Works like the built-in @property decorator.

Source code in scrape_schema/base.py
class sc_param(property):
    """Property-like descriptor marker for BaseSchema.

    Values exposed through this descriptor are included in the output of the
    schema `dict()` method. Usage is the same as the built-in `@property`
    decorator; no behaviour is added on top of `property`.
    """
Source code in scrape_schema/base.py
class BaseField:
    def __init__(
        self,
        auto_type: bool = True,
        default: Any = ...,
        alias: Optional[str] = None,
        **kwargs,
    ):
        """Create a base field.

        Args:
            auto_type: enable the auto_type feature (default True).
                Used by the BaseSchema object.
            default: value to use when parsing fails at runtime.
                `...` (Ellipsis, the default) means "no default": the error
                is raised instead.
            alias: name under which the field is shown in the BaseSchema
                object. When omitted, the attribute key is used.
            **kwargs: accepted but not used by this initializer.
        """
        # user-facing configuration
        self.alias = alias
        self.auto_type = auto_type
        self.default = default

        # runtime parse state
        self.is_default = False  # flag check failed parsed value
        self._is_success: bool = True  # False - field is failed, True, no errors
        self._last_failed_method: Optional[MarkupMethod] = None

        # methods queue and the handler for special methods
        self._stack_methods: List[MarkupMethod] = []
        self._spec_method_handler: SpecialMethodsHandler = DEFAULT_SPEC_METHOD_HANDLER

    @abstractmethod
    def _prepare_markup(self, markup):
        """Hook for subclasses: convert raw markup before parsing."""
        pass  # pragma: no cover

    @abstractmethod
    def sc_parse(self, markup: Any):
        """Hook for subclasses: field parse entrypoint."""
        pass  # pragma: no cover

__init__(auto_type=True, default=..., alias=None, **kwargs)

Base Field class

Parameters:

Name Type Description Default
auto_type bool

usage auto_type feature. default True. works in BaseSchema object

True
default Any

default value if parsing runtime will catch an error. Throws an error by default

...
alias Optional[str]

alias fields to display in the BaseSchema object. If no value is specified, will apply the key of the given attribute

None
Source code in scrape_schema/base.py
def __init__(
    self,
    auto_type: bool = True,
    default: Any = ...,
    alias: Optional[str] = None,
    **kwargs,
):
    """Create a base field.

    Args:
        auto_type: enable the auto_type feature (default True).
            Used by the BaseSchema object.
        default: value to use when parsing fails at runtime.
            `...` (Ellipsis, the default) means "no default": the error
            is raised instead.
        alias: name under which the field is shown in the BaseSchema object.
            When omitted, the attribute key is used.
        **kwargs: accepted but not used by this initializer.
    """
    # user-facing configuration
    self.alias = alias
    self.auto_type = auto_type
    self.default = default

    # runtime parse state
    self.is_default = False  # flag check failed parsed value
    self._is_success: bool = True  # False - field is failed, True, no errors
    self._last_failed_method: Optional[MarkupMethod] = None

    # methods queue and the handler for special methods
    self._stack_methods: List[MarkupMethod] = []
    self._spec_method_handler: SpecialMethodsHandler = DEFAULT_SPEC_METHOD_HANDLER

Bases: BaseField

Source code in scrape_schema/base.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
class Field(BaseField):
    """Field that records a chain of methods and replays it on markup.

    The chain-building methods (`fn`, `replace`, `re_search`, `strip`,
    `__getitem__`, ...) only append a `MarkupMethod` to the call stack and
    return the field itself; `sc_parse` executes the recorded stack.
    """

    def _prepare_markup(
        self, markup: Union[str, bytes, Selector, SelectorList]
    ) -> Union[Selector, SelectorList]:
        """convert string/bytes to parser class context

        Args:
            markup: str, bytes, Selector, SelectorList object

        Returns:
            markup converted to Selector object, if markup arg is str or bytes

        Raises:
            TypeError if markup is not str, bytes, Selector, SelectorList object
        """
        self._last_failed_method = None  # reset failed method link
        _logger.debug("Field markup type: %s", type(markup).__name__)
        if isinstance(markup, (Selector, SelectorList)):
            return markup
        elif isinstance(markup, str):
            return Selector(markup)
        elif isinstance(markup, bytes):
            return Selector(body=markup)
        raise TypeError(f"Unsupported markup type: {type(markup).__name__}")

    @property
    def success(self) -> bool:
        """last parsed field status"""
        return self._is_success

    def _special_method(self, markup: Any, method: MarkupMethod) -> Any:
        """Handle special method

        Args:
            markup: variable for special_method
            method: markup method object

        Returns:
            method execution result
        """
        return self._spec_method_handler.handle(method, markup)

    def __repr__(self):
        args = f"auto_type={self.auto_type}, default={self.default}, alias={self.alias}"
        if self._stack_methods:
            return (  # pragma: no cover
                f"{self.__class__.__name__}({args})."
                f"{'.'.join(repr(m) for m in self._stack_methods)}"
            )
        return f"{self.__class__.__name__}({args})"

    @staticmethod
    def _accept_method(markup: Any, method: MarkupMethod) -> Any:
        """call method

        Args:
            markup: variable for MarkupMethod
            method: markup method object

        Returns:
            method execution result

        Raises:
            TypeError: if `method.METHOD_NAME` is not a string
        """
        if isinstance(method.METHOD_NAME, str):
            class_method = getattr(markup, method.METHOD_NAME)
            # Selector.attrib check case or raw dict
            if isinstance(class_method, (property, dict)):
                return class_method
            # callable
            return class_method(*method.args, **method.kwargs)
        raise TypeError(  # pragma: no cover
            f"`{type(markup).__name__}` is not a valid method name: {method.METHOD_NAME}"
        )

    @staticmethod
    def __log_debug_markup_len(markup: Any) -> int:
        """this method for logging module"""
        if isinstance(markup, NoneType):  # pragma: no cover
            return 0
        elif isinstance(markup, (Selector, SelectorList)):
            markup = markup.get()
            return 0 if isinstance(markup, NoneType) else len(markup)
        return len(str(markup))

    @staticmethod
    def __log_debug_markup_part(markup: Any, max_len: int = 64) -> str:
        """this method for logging module"""
        if isinstance(markup, NoneType):  # pragma: no cover
            return ""
        elif isinstance(markup, (Selector, SelectorList)):
            markup = markup.get()
            return (
                ""
                if isinstance(markup, NoneType)
                else f"{markup[:max_len]}..."
                if len(markup) > max_len
                else markup
            )
        markup = str(markup)
        return f"{markup[:max_len]}..." if len(markup) > max_len else markup

    def _call_stack_methods(self, markup: Any) -> Any:
        """call all passed methods

        Args:
            markup: first markup target

        Returns:
            result of all executed methods. If a default is set and the
            result is `None` or `[]`, the default is returned instead

        Raises:
            most often `AttributeError` and `TypeError` due to the absence
            of a method name due to incorrect output data in the call chain
        """
        self._is_success = True  # reset success parsed flag
        # reset as well, so a failure in a previous parse does not leave
        # the flag stale for this one
        self.is_default = False
        result = markup
        _logger.info(
            "Start parse markup. Stack methods count: %s", len(self._stack_methods)
        )
        _logger.debug(
            "Markup (len=%i) target: %s",
            self.__log_debug_markup_len(markup),
            self.__log_debug_markup_part(markup),
        )
        for i, method in enumerate(self._stack_methods, 1):
            try:
                if isinstance(method.METHOD_NAME, SpecialMethods):
                    result = self._special_method(result, method)
                else:
                    result = self._accept_method(result, method)
                _logger.debug(
                    "[%s] %s -> %s", i, method, self.__log_debug_markup_part(result)
                )
            except Exception as e:
                self._is_success = False  # mark failed parse field
                _logger.warning(
                    "Oops, %s throw exception `%s: %s`",
                    str(method).lower(),
                    e.__class__.__name__,
                    e,
                )
                self._last_failed_method = method
                # the original (first) markup is passed for the error log
                return self._stack_method_error_handler(method, e, markup)
        _logger.info("Call methods done. result=%s", result)
        if self.default is not Ellipsis and result in (None, []):
            return self.default
        return result

    def _stack_method_error_handler(
        self, method: Union[MarkupMethod, SpecialMethods], e: Exception, markup
    ):
        """Log a failed method call and return the default or re-raise.

        Args:
            method: method that raised the exception
            e: caught exception
            markup: markup written to the error log

        Returns:
            `self.default`, when a default value is set

        Raises:
            the passed exception `e`, when no default is set (Ellipsis)
        """
        _logger.error("Failed call method: %s", method)
        _logger.error(
            "Full markup:\nSTART\n%s\nEND",
            markup.get() if isinstance(markup, (Selector, SelectorList)) else markup,
        )
        if self.default is Ellipsis:
            _logger.error(
                "Method `%r` return traceback: `%s: %s`",
                method,
                e.__class__.__name__,
                e,
            )
            raise e
        _logger.info(
            "Skip type casting and set default value: %s",
            self.default,
        )
        self.is_default = True
        return self.default

    def sc_parse(self, markup: Union[str, bytes, Selector, SelectorList]) -> Any:
        """parse field entrypoint. Execute all passed methods

        Args:
            markup: markup target

        Returns:
            result of all executed methods
        """
        markup = self._prepare_markup(markup)
        return self._call_stack_methods(markup)

    # build in methods

    def fn(self, function: Callable[..., Any]) -> SpecialMethodsProtocol:
        """call another function and return result

        Args:
            function: function to be executed

        Returns:
            this field, for chaining. The function result is produced
            when the field is parsed
        """
        return self.add_method(SpecialMethods.FN, function=function)  # type: ignore

    def concat_l(self, left_string: str) -> SpecialMethodsProtocol:
        """Concatenate `left_string` with the last chain value (a string).

        NOTE(review): the original docs describe this both as "add string to
        left" and as `value + left_string`; confirm the operand order against
        the CONCAT_L handler.

        Args:
            left_string: string to concatenate

        Returns:
            this field, for chaining. The concatenated value is produced
            when the field is parsed
        """
        return self.add_method(SpecialMethods.CONCAT_L, left_string)  # type: ignore

    def concat_r(self, right_string: str) -> SpecialMethodsProtocol:
        """Concatenate `right_string` with the last chain value (a string).

        NOTE(review): the original docs describe this both as "add string to
        right" and as `right_string + value`; confirm the operand order
        against the CONCAT_R handler.

        Args:
            right_string: string to concatenate

        Returns:
            this field, for chaining. The concatenated string is produced
            when the field is parsed
        """
        return self.add_method(SpecialMethods.CONCAT_R, right_string)  # type: ignore

    def sc_replace(self, old: str, new: str, count: int = -1) -> SpecialMethodsProtocol:
        """Deprecated alias of `replace`.

        Emits a `DeprecationWarning` and delegates to `replace` with the same
        arguments.
        """
        warnings.warn(
            "Method `sc_replace` "
            "deprecated and will be removed in future releases. Usage `replace method instead`",
            category=DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller, not to this wrapper
        )
        return self.replace(old, new, count)

    def replace(self, old: str, new: str, count: int = -1) -> SpecialMethodsProtocol:
        """str.replace method. Last argument should be string. old string replaced by new

        Args:
            old: string
            new: string
            count: Maximum number of occurrences to replace. -1 (default) means replace all occurrences.

        Returns:
            this field, for chaining. The replaced string is produced
            when the field is parsed
        """
        return self.add_method(SpecialMethods.REPLACE, old, new, count)  # type: ignore

    def re_search(
        self,
        pattern: Union[str, Pattern[str]],
        flags: Union[int, RegexFlag] = 0,
        groupdict: bool = False,
    ) -> SpecialMethodsProtocol:
        """re.search method for text result.

        Last chain should be return string.

        Args:
            pattern: regex pattern
            flags: regex compilation flags
            groupdict: accept groupdict method. pattern required named groups. default False

        Raises:
            TypeError: if groupdict=True and pattern not contains named groups
        """
        # compile eagerly so a missing named group is reported at declaration time
        pattern = re.compile(pattern, flags=flags)
        if groupdict and not pattern.groupindex:
            raise TypeError("groupdict required named groups")
        return self.add_method(  # type: ignore
            SpecialMethods.REGEX_SEARCH, pattern, groupdict, flags
        )

    def re_findall(
        self,
        pattern: Union[str, Pattern[str]],
        flags: Union[int, RegexFlag] = 0,
        groupdict: bool = False,
    ) -> SpecialMethodsProtocol:
        """[match for match in re.finditer(...)] method for text result.

        Last chain should be return string.

        Args:
            pattern: regex pattern
            flags: regex compilation flags
            groupdict: accept groupdict method. pattern required named groups. default False

        Raises:
            TypeError: if groupdict=True and pattern not contains named groups
        """
        # compile eagerly so a missing named group is reported at declaration time
        pattern = re.compile(pattern, flags=flags)
        if groupdict and not pattern.groupindex:
            raise TypeError("groupdict required named groups")

        return self.add_method(  # type: ignore
            SpecialMethods.REGEX_FINDALL, pattern, groupdict, flags
        )

    def chomp_js_parse(
        self, unicode_escape: Any = False, json_params: Any = None
    ) -> SpecialMethodsProtocol:
        """Extracts first JSON object encountered in the input string

        Args:
            unicode_escape: Attempt to fix input string if it contains escaped special characters
            json_params: Allow passing down standard `json.loads` options

        Returns:
            this field, for chaining. The extracted JSON object is produced
            when the field is parsed
        """
        return self.add_method(  # type: ignore
            SpecialMethods.CHOMP_JS_PARSE, unicode_escape, json_params
        )

    def chomp_js_parse_all(
        self,
        unicode_escape: Any = False,
        omitempty: Any = False,
        json_params: Any = None,
    ) -> SpecialMethodsProtocol:
        """Returns a list extracting all JSON objects encountered in the input string. Can be used to read JSON Lines

        Args:
            unicode_escape: Attempt to fix input string if it contains escaped special characters
            omitempty: Skip empty dictionaries and lists
            json_params: Allow passing down standard `json.loads` flags

        Returns:
            this field, for chaining. The encountered JSON objects are
            produced when the field is parsed
        """
        return self.add_method(  # type: ignore
            SpecialMethods.CHOMP_JS_PARSE_ALL, unicode_escape, omitempty, json_params
        )

    # 6.0.0 special methods

    def strip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
        """Same as `str.strip()` method.

        If last chain list[str] argument - invoke this method to all arguments

        Args:
            chars: strip chars. Default strip all whitespaces (\\t, \\n included)
        """
        return self.add_method(SpecialMethods.STRIP, chars)  # type: ignore

    def rstrip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
        """Same as `str.rstrip()` method.

        If last chain list[str] argument - invoke this method to all arguments

        Args:
            chars: strip chars. Default strip right whitespaces (\\t, \\n included)
        """
        return self.add_method(SpecialMethods.R_STRIP, chars)  # type: ignore

    def lstrip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
        """Same as `str.lstrip()` method.

        If last chain list[str] argument - invoke this method to all arguments

        Args:
            chars: strip chars. Default strip left whitespaces (\\t, \\n included)
        """
        return self.add_method(SpecialMethods.L_STRIP, chars)  # type: ignore

    def lower(self) -> SpecialMethodsProtocol:
        """Same as `str.lower()` method.

        If last chain list[str] argument - invoke this method to all arguments
        """
        return self.add_method(SpecialMethods.LOWER)  # type: ignore

    def upper(self) -> SpecialMethodsProtocol:
        """Same as `str.upper()` method.

        If last chain list[str] argument - invoke this method to all arguments
        """
        return self.add_method(SpecialMethods.UPPER)  # type: ignore

    def capitalize(self) -> SpecialMethodsProtocol:
        """Same as `str.capitalize()` method.

        If last chain list[str] argument - invoke this method to all arguments
        """
        return self.add_method(SpecialMethods.CAPITALIZE)  # type: ignore

    def split(
        self, sep: Optional[str] = None, max_split: int = -1
    ) -> SpecialMethodsProtocol:
        """Same as `str.split()` method.

        If last chain list[str] argument - raise TypeError

        Args:
            sep: separator passed to the SPLIT handler
            max_split: maximum number of splits passed to the SPLIT handler
        """
        return self.add_method(SpecialMethods.SPLIT, sep, max_split)  # type: ignore

    def join(self, join_sep: str) -> SpecialMethodsProtocol:
        """Same as `str.join()` method.

        If last chain list[str] argument - raise TypeError

        NOTE(review): `str.join` normally consumes a list of strings, so the
        sentence above looks copied from `split`; confirm against the
        STR_JOIN handler.

        Args:
            join_sep: separate char for result
        """
        return self.add_method(SpecialMethods.STR_JOIN, join_sep)  # type: ignore

    def count(self) -> SpecialMethodsProtocol:
        """Return items count.

        if last chain value is list - return len value else 1

        """
        return self.add_method(SpecialMethods.COUNT)  # type: ignore

    def add_method(
        self, method_name: Union[str, SpecialMethods], *args, **kwargs
    ) -> Self:
        """low-level interface adding methods to call stack

        Args:
            method_name: attribute name to call on the chain value, or a
                SpecialMethods member
            *args: positional arguments stored with the method
            **kwargs: keyword arguments stored with the method

        Returns:
            this field, for chaining
        """
        self._stack_methods.append(MarkupMethod(method_name, args=args, kwargs=kwargs))
        return self

    def __getitem__(self, item: Hashable) -> Self:
        """This method provide __getitem__ API (get by key or slice)

        Args:
            item: key or slice
        """
        return self.add_method("__getitem__", item)

success property

last parsed field status

__getitem__(item)

This method provides the `__getitem__` API (get by key or slice)

Parameters:

Name Type Description Default
item Hashable

key or slice

required
Source code in scrape_schema/base.py
def __getitem__(self, item: Hashable) -> Self:
    """Queue a `__getitem__` call (get by key or slice) on the chain value.

    Args:
        item: key or slice

    Returns:
        the result of `add_method`, so calls can be chained
    """
    # nothing is indexed here: the lookup is only recorded via add_method
    return self.add_method("__getitem__", item)

__log_debug_markup_len(markup) staticmethod

this method for logging module

Source code in scrape_schema/base.py
@staticmethod
def __log_debug_markup_len(markup: Any) -> int:
    """Return the markup length reported in debug log messages.

    `None` counts as 0; for Selector/SelectorList the length of the `get()`
    result is used (0 when it is `None`); any other value is measured
    through `str()`.
    """
    if isinstance(markup, NoneType):  # pragma: no cover
        return 0
    if not isinstance(markup, (Selector, SelectorList)):
        return len(str(markup))
    extracted = markup.get()
    if isinstance(extracted, NoneType):
        return 0
    return len(extracted)

__log_debug_markup_part(markup, max_len=64) staticmethod

this method for logging module

Source code in scrape_schema/base.py
@staticmethod
def __log_debug_markup_part(markup: Any, max_len: int = 64) -> str:
    """Return a short preview of markup for debug log messages.

    `None` (or a selector whose `get()` is `None`) gives an empty string.
    Text longer than `max_len` is cut to `max_len` chars followed by "...".
    """
    if isinstance(markup, NoneType):  # pragma: no cover
        return ""
    if isinstance(markup, (Selector, SelectorList)):
        text = markup.get()
        if isinstance(text, NoneType):
            return ""
    else:
        text = str(markup)
    if len(text) > max_len:
        return f"{text[:max_len]}..."
    return text

add_method(method_name, *args, **kwargs)

low-level interface adding methods to call stack

Source code in scrape_schema/base.py
def add_method(
    self, method_name: Union[str, SpecialMethods], *args, **kwargs
) -> Self:
    """Low-level interface: append a method call to the call stack.

    The name and its arguments are wrapped in a `MarkupMethod` and stored in
    `self._stack_methods`; `self` is returned so calls can be chained.
    """
    queued = MarkupMethod(method_name, args=args, kwargs=kwargs)
    self._stack_methods.append(queued)
    return self

capitalize()

Same as str.capitalize() method.

If last chain list[str] argument - invoke this method to all arguments

Source code in scrape_schema/base.py
def capitalize(self) -> SpecialMethodsProtocol:
    """Same as `str.capitalize()` method.

    If the last chain value is a list[str], the method is invoked on every
    item.

    Returns:
        the result of `add_method`, so calls can be chained
    """
    # only records SpecialMethods.CAPITALIZE
    return self.add_method(SpecialMethods.CAPITALIZE)  # type: ignore

chomp_js_parse(unicode_escape=False, json_params=None)

Extracts first JSON object encountered in the input string

Parameters:

Name Type Description Default
unicode_escape Any

Attempt to fix input string if it contains escaped special characters

False
json_params Any

Allow passing down standard json.loads options

None

Returns:

Type Description
SpecialMethodsProtocol

Extracted JSON object

Source code in scrape_schema/base.py
def chomp_js_parse(
    self, unicode_escape: Any = False, json_params: Any = None
) -> SpecialMethodsProtocol:
    """Extract the first JSON object encountered in the input string.

    Args:
        unicode_escape: Attempt to fix input string if it contains escaped special characters
        json_params: Allow passing down standard `json.loads` options

    Returns:
        the result of `add_method`, so calls can be chained. The extracted
        JSON object is the value this step is documented to produce
    """
    # arguments are stored positionally with SpecialMethods.CHOMP_JS_PARSE
    return self.add_method(  # type: ignore
        SpecialMethods.CHOMP_JS_PARSE, unicode_escape, json_params
    )

chomp_js_parse_all(unicode_escape=False, omitempty=False, json_params=None)

Returns a list extracting all JSON objects encountered in the input string. Can be used to read JSON Lines

Parameters:

Name Type Description Default
unicode_escape Any

Attempt to fix input string if it contains escaped special characters

False
omitempty Any

Skip empty dictionaries and lists

False
json_params Any

Allow passing down standard json.loads flags

None

Returns:

Type Description
SpecialMethodsProtocol

Iterating over it yields all encountered JSON objects

Source code in scrape_schema/base.py
def chomp_js_parse_all(
    self,
    unicode_escape: Any = False,
    omitempty: Any = False,
    json_params: Any = None,
) -> SpecialMethodsProtocol:
    """Extract all JSON objects encountered in the input string. Can be used to read JSON Lines

    Args:
        unicode_escape: Attempt to fix input string if it contains escaped special characters
        omitempty: Skip empty dictionaries and lists
        json_params: Allow passing down standard `json.loads` flags

    Returns:
        the result of `add_method`, so calls can be chained. The encountered
        JSON objects are the value this step is documented to produce
    """
    # arguments are stored positionally with SpecialMethods.CHOMP_JS_PARSE_ALL
    return self.add_method(  # type: ignore
        SpecialMethods.CHOMP_JS_PARSE_ALL, unicode_escape, omitempty, json_params
    )

concat_l(left_string)

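Add a string to the left. The last chain value should be a string. Documented as `value + left_string`.

Parameters:

Name Type Description Default
left_string str

string to concatenate

required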

Returns:

Type Description
SpecialMethodsProtocol

concatenated value

Source code in scrape_schema/base.py
def concat_l(self, left_string: str) -> SpecialMethodsProtocol:
    """Concatenate `left_string` with the last chain value (a string).

    NOTE(review): the original docs describe this both as "add string to
    left" and as `value + left_string`; confirm the operand order against
    the CONCAT_L handler.

    Args:
        left_string: string to concatenate

    Returns:
        the result of `add_method`, so calls can be chained
    """
    return self.add_method(SpecialMethods.CONCAT_L, left_string)  # type: ignore

concat_r(right_string)

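Add a string to the right. The last chain value should be a string. Documented as `right_string + value`.

Parameters:

Name Type Description Default
right_string str

string to concatenate

required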

Returns:

Type Description
SpecialMethodsProtocol

concatenated string

Source code in scrape_schema/base.py
def concat_r(self, right_string: str) -> SpecialMethodsProtocol:
    """Concatenate `right_string` with the last chain value (a string).

    NOTE(review): the original docs describe this both as "add string to
    right" and as `right_string + value`; confirm the operand order against
    the CONCAT_R handler.

    Args:
        right_string: string to concatenate

    Returns:
        the result of `add_method`, so calls can be chained
    """
    return self.add_method(SpecialMethods.CONCAT_R, right_string)  # type: ignore

count()

Return items count.

if last chain value is list - return len value else 1

Source code in scrape_schema/base.py
def count(self) -> SpecialMethodsProtocol:
    """Return items count.

    If the last chain value is a list, its length is returned; otherwise 1.

    Returns:
        the result of `add_method`, so calls can be chained
    """
    # only records SpecialMethods.COUNT
    return self.add_method(SpecialMethods.COUNT)  # type: ignore

fn(function)

call another function and return result

Parameters:

Name Type Description Default
function Callable[..., Any]

function to be executed

required

Returns:

Type Description
SpecialMethodsProtocol

executed function result

Source code in scrape_schema/base.py
def fn(self, function: Callable[..., Any]) -> SpecialMethodsProtocol:
    """Call another function and use its result as the chain value.

    Args:
        function: function to be executed

    Returns:
        the result of `add_method`, so calls can be chained. The executed
        function result is the value this step is documented to produce
    """
    # the callable is stored as the `function` keyword argument
    return self.add_method(SpecialMethods.FN, function=function)  # type: ignore

join(join_sep)

Same as str.join() method.

If last chain list[str] argument - raise TypeError

Parameters:

Name Type Description Default
join_sep str

separate char for result

required
Source code in scrape_schema/base.py
def join(self, join_sep: str) -> SpecialMethodsProtocol:
    """Same as `str.join()` method.

    If last chain list[str] argument - raise TypeError

    NOTE(review): `str.join` normally consumes a list of strings, so the
    sentence above may be inaccurate; confirm against the STR_JOIN handler.

    Args:
        join_sep: separate char for result

    Returns:
        the result of `add_method`, so calls can be chained
    """
    return self.add_method(SpecialMethods.STR_JOIN, join_sep)  # type: ignore

lower()

Same as str.lower() method.

If last chain list[str] argument - invoke this method to all arguments

Source code in scrape_schema/base.py
def lower(self) -> SpecialMethodsProtocol:
    """Same as `str.lower()` method.

    If the last chain value is a list[str], the method is invoked on every
    item.

    Returns:
        the result of `add_method`, so calls can be chained
    """
    # only records SpecialMethods.LOWER
    return self.add_method(SpecialMethods.LOWER)  # type: ignore

lstrip(chars=None)

Same as str.lstrip() method.

If last chain list[str] argument - invoke this method to all arguments

Parameters:

Name Type Description Default
chars Optional[str]

strip chars. Default strip left whitespaces (\t, \n included)

None

Source code in scrape_schema/base.py
def lstrip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
    """Same as `str.lstrip()` method.

    If last chain list[str] argument - invoke this method to all arguments

    Args:
        chars: strip chars. Default strip left whitespaces (\\t, \\n included)

    Returns:
        the result of `add_method`, so calls can be chained
    """
    return self.add_method(SpecialMethods.L_STRIP, chars)  # type: ignore

re_findall(pattern, flags=0, groupdict=False)

[match for match in re.finditer(...)] method for text result.

Last chain should be return string.

Parameters:

Name Type Description Default
pattern Union[str, Pattern[str]]

regex pattern

required
flags Union[int, RegexFlag]

regex compilation flags

0
groupdict bool

accept groupdict method. pattern required named groups. default False

False

Raises:

Type Description
TypeError

if groupdict=True and pattern not contains named groups

Source code in scrape_schema/base.py
def re_findall(
    self,
    pattern: Union[str, Pattern[str]],
    flags: Union[int, RegexFlag] = 0,
    groupdict: bool = False,
) -> SpecialMethodsProtocol:
    """[match for match in re.finditer(...)] method for text result.

    Last chain should be return string.

    Args:
        pattern: regex pattern
        flags: regex compilation flags
        groupdict: accept groupdict method. pattern required named groups. default False

    Returns:
        the result of `add_method`, so calls can be chained

    Raises:
        TypeError: if groupdict=True and pattern not contains named groups
    """
    # compiled eagerly, so a pattern without named groups fails here
    pattern = re.compile(pattern, flags=flags)
    if groupdict and not pattern.groupindex:
        raise TypeError("groupdict required named groups")

    return self.add_method(  # type: ignore
        SpecialMethods.REGEX_FINDALL, pattern, groupdict, flags
    )

re_search(pattern, flags=0, groupdict=False)

re.search method for text result.

Last chain should be return string.

Parameters:

Name Type Description Default
pattern Union[str, Pattern[str]]

regex pattern

required
flags Union[int, RegexFlag]

regex compilation flags

0
groupdict bool

accept groupdict method. pattern required named groups. default False

False

Raises: TypeError: if groupdict=True and the pattern does not contain named groups

Source code in scrape_schema/base.py
def re_search(
    self,
    pattern: Union[str, Pattern[str]],
    flags: Union[int, RegexFlag] = 0,
    groupdict: bool = False,
) -> SpecialMethodsProtocol:
    """re.search method for text result.

    Last chain should be return string.

    Args:
        pattern: regex pattern
        flags: regex compilation flags
        groupdict: accept groupdict method. pattern required named groups. default False

    Returns:
        the result of `add_method`, so calls can be chained

    Raises:
        TypeError: if groupdict=True and pattern not contains named groups
    """
    # compiled eagerly, so a pattern without named groups fails here
    pattern = re.compile(pattern, flags=flags)
    if groupdict and not pattern.groupindex:
        raise TypeError("groupdict required named groups")
    return self.add_method(  # type: ignore
        SpecialMethods.REGEX_SEARCH, pattern, groupdict, flags
    )

replace(old, new, count=-1)

str.replace method. Last argument should be string. old string replaced by new

Parameters:

Name Type Description Default
old str

string

required
new str

string

required
count int

Maximum number of occurrences to replace. -1 (default) means replace all occurrences.

-1

Returns:

Type Description
SpecialMethodsProtocol

replaced string

Source code in scrape_schema/base.py
def replace(self, old: str, new: str, count: int = -1) -> SpecialMethodsProtocol:
    """str.replace method. The last chain value should be a string; `old` is replaced by `new`.

    Args:
        old: string
        new: string
        count: Maximum number of occurrences to replace. -1 (default) means replace all occurrences.

    Returns:
        the result of `add_method`, so calls can be chained. The replaced
        string is the value this step is documented to produce
    """
    return self.add_method(SpecialMethods.REPLACE, old, new, count)  # type: ignore

rstrip(chars=None)

Same as str.rstrip() method.

If last chain list[str] argument - invoke this method to all arguments

Parameters:

Name Type Description Default
chars Optional[str]

strip chars. Default strip right whitespaces (\t, \n included)

None

Source code in scrape_schema/base.py
def rstrip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
    """Same as `str.rstrip()` method.

    If last chain list[str] argument - invoke this method to all arguments

    Args:
        chars: strip chars. Default strip right whitespaces (\\t, \\n included)

    Returns:
        the result of `add_method`, so calls can be chained
    """
    return self.add_method(SpecialMethods.R_STRIP, chars)  # type: ignore

sc_parse(markup)

parse field entrypoint. Execute all passed methods

Parameters:

Name Type Description Default
markup Union[str, bytes, Selector, SelectorList]

markup target

required

Returns:

Type Description
Any

result of all executed methods

Source code in scrape_schema/base.py
def sc_parse(self, markup: Union[str, bytes, Selector, SelectorList]) -> Any:
    """Entry point for parsing with this field.

    The markup is first passed through `_prepare_markup`; the prepared value
    is then handed to `_call_stack_methods`, which executes all chained methods.

    Args:
        markup: markup target

    Returns:
        result of all executed methods
    """
    return self._call_stack_methods(self._prepare_markup(markup))

split(sep=None, max_split=-1)

Same as str.split() method.

If last chain list[str] argument - raise TypeError

Source code in scrape_schema/base.py
def split(
    self, sep: Optional[str] = None, max_split: int = -1
) -> SpecialMethodsProtocol:
    """Add a `str.split()` call to the chain.

    If the previous chain result is a list[str], a TypeError is raised.

    Args:
        sep: separator; None (default) is forwarded as is
        max_split: maximum number of splits; -1 (default) is forwarded as is
    """
    split_args = (sep, max_split)
    return self.add_method(SpecialMethods.SPLIT, *split_args)  # type: ignore

strip(chars=None)

Same as str.strip() method.

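If the previous chain result is a list[str], this method is applied to every item.

Parameters:

Name Type Description Default
chars Optional[str]

characters to strip. By default, all surrounding whitespace is stripped (\t and \n included)

None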

Source code in scrape_schema/base.py
def strip(self, chars: Optional[str] = None) -> SpecialMethodsProtocol:
    r"""Same as `str.strip()` method.

    If the previous chain result is a list[str], this method is applied
    to every item.

    Args:
        chars: characters to strip. By default all surrounding whitespace is
            stripped (\t, \n included)
    """
    # NOTE: the docstring is a raw string so that "\t" and "\n" stay literal
    # text; in a regular string they became a real tab/newline and broke the
    # rendered "Args" section.
    return self.add_method(SpecialMethods.STRIP, chars)  # type: ignore

upper()

Same as str.upper() method.

If last chain list[str] argument - invoke this method to all arguments

Source code in scrape_schema/base.py
def upper(self) -> SpecialMethodsProtocol:
    """Add a `str.upper()` call to the chain.

    If the previous chain result is a list[str], the method is applied
    to every item.
    """
    chained = self.add_method(SpecialMethods.UPPER)  # type: ignore
    return chained

Bases: Field

This field provide parsel.Selector api and special methods

Source code in scrape_schema/field.py
class Parsel(Field):
    """Field exposing the `parsel.Selector` API together with the special methods.

    Every public method forwards its arguments to `add_method`, which is what
    allows calls to be chained, e.g. `Parsel().css("a").attrib.get(key="href")`.
    """

    def __init__(
        self,
        auto_type: bool = True,
        default: Any = ...,
        *,
        raw: bool = False,
        alias: Optional[str] = None,
    ) -> None:
        """Base field provide Parsel.Selector API and special methods

        Args:
            auto_type: usage auto type feature in BaseSchema scope. Default True
            default: set default value, if method return traceback.
                Disable auto type. If not set - raise error
            raw: raw text parse mode. Auto accept `.xpath("//p/text()").get()` method
            alias: field alias. default None
        """
        super().__init__(auto_type=auto_type, default=default, alias=alias)
        if raw:
            # raw text mode: start the chain with the same calls as `raw_text`
            self.xpath("//p/text()").get()

    def css(self, query: str) -> Self:
        """Apply the given CSS selector and return a SelectorList instance.

        In the background, CSS queries are translated into XPath queries using
        cssselect library and run .xpath() method.

        Args:
            query: string containing the CSS selector to apply.
        """
        return self.add_method("css", query)

    @property
    def raw_text(self) -> SpecialMethodsProtocol:
        """Shortcut for the `Parsel().xpath('//p/text()').get()` call.

        Use it to get raw text (not html), for which calling `parsel.Selector`
        methods directly will raise an error.
        """
        # Parsel is not meant for raw text: it will try to "fix" html and parse as html usage `raw_text` property
        # or `xpath(//p/text()).get()` or raw=True in init constructor
        return self.xpath("//p/text()").get()

    def xpath(
        self, query: str, namespaces: Optional[Mapping[str, str]] = None, **kwargs: Any
    ) -> Self:
        """Xpath selector

        Args:
            query: is a string containing the XPATH query to apply.
            namespaces: is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes.
            **kwargs: additional named arguments forwarded together with the query
                (parsel's ``Selector.xpath`` treats them as XPath variables)

        Returns:
            SelectorList
        """
        return self.add_method("xpath", query, namespaces, **kwargs)

    def re(
        self, regex: Union[str, Pattern[str]], replace_entities: bool = True
    ) -> SpecialMethodsProtocol:
        """Apply the given regex and return a list of strings with the matches.

        Args:
            regex: regular expression
            replace_entities: replace char entity refers replaced by their corresponding char (``&``, ``<``).

        Returns:
            list of strings with the matches
        """
        return self.add_method("re", regex, replace_entities)  # type: ignore

    def _is_attrib(self) -> bool:
        """Check that the most recently chained method is `attrib`.

        Returns:
            True when the last chained method is `attrib`

        Raises:
            TypeError: if nothing has been chained yet or the last chained
                method is not `attrib`
        """
        # An empty stack must end in the documented TypeError, not an IndexError.
        last_method = (
            self._stack_methods[-1].METHOD_NAME if self._stack_methods else None
        )
        if last_method != "attrib":
            _logger.error("Last method should be `attrib`, not `%s`", last_method)
            raise TypeError(f"Last method should be `attrib`, not `{last_method}`")
        return True

    def get(
        self, default: Optional[str] = None, key: Optional[Hashable] = None
    ) -> SpecialMethodsProtocol:  # type: ignore
        """Serialize and return the matched nodes in a single string.

        Percent encoded content is unquoted.

        If `key` param passed - get value from attrib property.

        Args:
            default: invoke Selector.get() method and return default value if is None
            key: get value from attrib property *attrib should be called in chain methods*

        Raises:
            TypeError: if both `default` and `key` are passed, or if `key` is
                passed and the previous chained method is not `attrib`
        """
        if key and default:
            message = (
                "get should be accept `key` OR `default` param, not `key` AND `default`"
            )
            _logger.error(message)
            raise TypeError(message)
        if key:
            self._is_attrib()  # raises TypeError unless `attrib` was chained last
            return self.add_method("get", key)  # type: ignore
        return self.add_method("get", default)  # type: ignore

    def jmespath(self, query: str, **kwargs: Any) -> Self:
        """Find objects matching the JMESPath ``query`` and return the result as a
        SelectorList instance with all elements flattened.
        List elements implement `Selector` interface too.

        Args:
            query: JMESPath string query
            **kwargs: additional named arguments forwarded together with the query
        """
        return self.add_method("jmespath", query, **kwargs)

    def getall(self) -> SpecialMethodsProtocol:
        """Call the .get() method for each element in this list
        and return their results flattened, as a list of strings.

        Returns:
            list[str]
        """
        return self.add_method("getall")  # type: ignore

    @property
    def attrib(self) -> AttribProtocol:
        """Return the attributes dictionary for the first element.

        If the list is empty, return an empty dict.

        Returns:
            dict[str, str]
        """
        return self.add_method("attrib")  # type: ignore

    def keys(self) -> SpecialMethodsProtocol:  # type: ignore
        """Get attrib keys

        Returns:
            dict keys
        Raises:
            TypeError: if prev chain is not attrib
        """
        self._is_attrib()  # raises TypeError unless `attrib` was chained last
        return self.add_method("keys")  # type: ignore

    def values(self) -> SpecialMethodsProtocol:  # type: ignore
        """Get attrib values

        Returns:
            dict_values
        Raises:
            TypeError: if prev chain is not attrib
        """
        self._is_attrib()  # raises TypeError unless `attrib` was chained last
        return self.add_method("values")  # type: ignore

    def items(self) -> SpecialMethodsProtocol:  # type: ignore
        """Get attrib items

        Returns:
            dict items
        Raises:
            TypeError: if prev chain is not attrib
        """
        self._is_attrib()  # raises TypeError unless `attrib` was chained last
        return self.add_method("items")  # type: ignore

attrib property

Return the attributes dictionary for the first element.

If the list is empty, return an empty dict.

Returns:

Type Description
AttribProtocol

dict[str, str]

raw_text property

Shortcut for the Parsel().xpath('//p/text()').get() call.

This method for getting raw text (not html), when calling parsel.Selector methods will raise an error

__init__(auto_type=True, default=..., *, raw=False, alias=None)

Base field provide Parsel.Selector API and special methods

Parameters:

Name Type Description Default
auto_type bool

use the auto type feature in BaseSchema scope. Default True

True
default Any

default value used if a method fails with an error. Disables auto type. If not set, the error is raised

...
raw bool

raw text parse mode. Automatically applies the .xpath("//p/text()").get() methods

False
alias Optional[str]

field alias. default None

None

Source code in scrape_schema/field.py
def __init__(
    self,
    auto_type: bool = True,
    default: Any = ...,
    *,
    raw: bool = False,
    alias: Optional[str] = None,
) -> None:
    """Create a field exposing the Parsel.Selector API and the special methods.

    Args:
        auto_type: use the auto type feature in BaseSchema scope. Default True
        default: default value used if a method fails with an error.
            Disables auto type. If not set, the error is raised
        raw: raw text parse mode. Automatically applies the
            `.xpath("//p/text()").get()` methods
        alias: field alias. default None
    """
    super().__init__(auto_type=auto_type, default=default, alias=alias)
    if not raw:
        return
    # raw text mode: chain the text extraction calls right away
    text_nodes = self.xpath("//p/text()")
    text_nodes.get()

css(query)

Apply the given CSS selector and return a SelectorList instance.

query is a string containing the CSS selector to apply.

In the background, CSS queries are translated into XPath queries using cssselect library and run .xpath() method.

Source code in scrape_schema/field.py
def css(self, query: str) -> Self:
    """Chain a CSS selection; the result is a SelectorList instance.

    CSS queries are translated into XPath queries in the background using
    the cssselect library, and the .xpath() method is run.

    Args:
        query: string containing the CSS selector to apply.
    """
    method_name = "css"
    return self.add_method(method_name, query)

get(default=None, key=None)

Serialize and return the matched nodes in a single string.

Percent encoded content is unquoted.

If key param passed - get value from attrib property.

Parameters:

Name Type Description Default
default Optional[str]

invoke Selector.get() method and return default value if is None

None
key Optional[Hashable]

get value from attrib property attrib should be called in chain methods

None

Raises:

Type Description
TypeError

if passed default and key arguments

Source code in scrape_schema/field.py
def get(
    self, default: Optional[str] = None, key: Optional[Hashable] = None
) -> SpecialMethodsProtocol:  # type: ignore
    """Serialize and return the matched nodes in a single string.

    Percent encoded content is unquoted.

    When `key` is given, the value is taken from the attrib property instead.

    Args:
        default: invoke Selector.get() and return this value if the result is None
        key: attribute name to read; *attrib should be called earlier in the chain*

    Raises:
        TypeError: if both `default` and `key` are passed
    """
    if key and default:
        message = (
            "get should be accept `key` OR `default` param, not `key` AND `default`"
        )
        _logger.error(message)
        raise TypeError(message)
    # `key` is only used after the attrib check has passed
    if key and self._is_attrib():
        return self.add_method("get", key)  # type: ignore
    return self.add_method("get", default)  # type: ignore

getall()

Call the .get() method for each element in this list and return their results flattened, as a list of strings.

Returns:

Type Description
SpecialMethodsProtocol

list[str]

Source code in scrape_schema/field.py
def getall(self) -> SpecialMethodsProtocol:
    """Call the .get() method for each element in this list
    and return the results flattened, as a list of strings.

    Returns:
        list[str]
    """
    method_name = "getall"
    return self.add_method(method_name)  # type: ignore

items()

Get attrib items

Returns:

Type Description
SpecialMethodsProtocol

dict items

Raises: TypeError: if prev chain is not attrib

Source code in scrape_schema/field.py
def items(self) -> SpecialMethodsProtocol:  # type: ignore
    """Get the (key, value) pairs of the attrib dictionary.

    Returns:
        dict items
    Raises:
        TypeError: if prev chain is not attrib
    """
    # `_is_attrib` is responsible for rejecting a chain that does not end in attrib
    if not self._is_attrib():
        return None
    return self.add_method("items")  # type: ignore

jmespath(query, **kwargs)

Find objects matching the JMESPath query and return the result as a SelectorList instance with all elements flattened. List elements implement Selector interface too.

Parameters:

Name Type Description Default
query str

JMESPath string query

required
**kwargs Any

Any additional named arguments are passed to the underlying

{}
Source code in scrape_schema/field.py
def jmespath(self, query: str, **kwargs: Any) -> Self:
    """Find objects matching the JMESPath ``query``.

    The result is a SelectorList instance with all elements flattened;
    list elements implement the `Selector` interface too.

    Args:
        query: JMESPath string query
        **kwargs: additional named arguments forwarded together with the query
    """
    method_name = "jmespath"
    return self.add_method(method_name, query, **kwargs)

keys()

Get attrib keys

Returns:

Type Description
SpecialMethodsProtocol

dict keys

Raises: TypeError: if prev chain is not attrib

Source code in scrape_schema/field.py
def keys(self) -> SpecialMethodsProtocol:  # type: ignore
    """Get the keys of the attrib dictionary.

    Returns:
        dict keys
    Raises:
        TypeError: if prev chain is not attrib
    """
    # `_is_attrib` is responsible for rejecting a chain that does not end in attrib
    if not self._is_attrib():
        return None
    return self.add_method("keys")  # type: ignore

re(regex, replace_entities=True)

Apply the given regex and return a list of strings with the matches.

Parameters:

Name Type Description Default
regex Union[str, Pattern[str]]

regular expression

required
replace_entities bool

replace char entity refers replaced by their corresponding char (&, <).

True

Returns:

Type Description
SpecialMethodsProtocol

string

Source code in scrape_schema/field.py
def re(
    self, regex: Union[str, Pattern[str]], replace_entities: bool = True
) -> SpecialMethodsProtocol:
    """Apply the given regex and return a list of strings with the matches.

    Args:
        regex: regular expression (string or compiled pattern)
        replace_entities: replace char entity refers replaced by their corresponding char (``&``, ``<``).

    Returns:
        list of strings with the matches
    """
    re_args = (regex, replace_entities)
    return self.add_method("re", *re_args)  # type: ignore

values()

Get attrib values

Returns:

Type Description
SpecialMethodsProtocol

dict_values

Raises: TypeError: if prev chain is not attrib

Source code in scrape_schema/field.py
def values(self) -> SpecialMethodsProtocol:  # type: ignore
    """Get the values of the attrib dictionary.

    Returns:
        dict_values
    Raises:
        TypeError: if prev chain is not attrib
    """
    # `_is_attrib` is responsible for rejecting a chain that does not end in attrib
    if not self._is_attrib():
        return None
    return self.add_method("values")  # type: ignore

xpath(query, namespaces=None, **kwargs)

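Xpath selector

Parameters:

Name Type Description Default
query str

is a string containing the XPATH query to apply.

required
namespaces Optional[Mapping[str, str]]

is an optional prefix: namespace-uri mapping (dict) for additional prefixes.

None
**kwargs Any

additional named arguments forwarded to the underlying xpath call

{}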

Returns:

Type Description
Self

SelectorList

Source code in scrape_schema/field.py
def xpath(
    self, query: str, namespaces: Optional[Mapping[str, str]] = None, **kwargs: Any
) -> Self:
    """Chain an XPath selection.

    Args:
        query: is a string containing the XPATH query to apply.
        namespaces: is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes.
        **kwargs: additional named arguments forwarded together with the query

    Returns:
        SelectorList
    """
    positional = (query, namespaces)
    return self.add_method("xpath", *positional, **kwargs)

Bases: Field

This field provide parsel.Selector api and special methods for json data

Source code in scrape_schema/field.py
class JMESPath(Field):
    """Field exposing the parsel.Selector JMESPath API and special methods for json data"""

    def __init__(
        self, auto_type: bool = False, default: Any = ..., alias: Optional[str] = None
    ) -> None:
        """Create a field providing the jmespath and special methods API.

        Args:
            auto_type: use the auto_type feature. Default False
            default: default value
            alias: field alias. default None
        """
        base_options = {"auto_type": auto_type, "default": default, "alias": alias}
        super().__init__(**base_options)

    def jmespath(self, query: str, **kwargs: Any) -> Self:
        """Find objects matching the JMESPath ``query``.

        The result is a SelectorList instance with all elements flattened;
        list elements implement the `Selector` interface too.

        Args:
            query: JMESPath string query
            **kwargs: additional named arguments forwarded together with the query
        """
        method_name = "jmespath"
        return self.add_method(method_name, query, **kwargs)

    def get(
        self, default: Optional[str] = None
    ) -> SpecialMethodsProtocol:  # type: ignore
        """Serialize and return the matched nodes in a single string.

        Percent encoded content is unquoted.

        Args:
            default: invoke Selector.get() and return this value if the result is None
        """
        method_name = "get"
        return self.add_method(method_name, default)  # type: ignore

    def getall(self) -> SpecialMethodsProtocol:
        """Call the .get() method for each element in this list
        and return the results flattened, as a list of strings.

        Returns:
            list[str]
        """
        method_name = "getall"
        return self.add_method(method_name)  # type: ignore

__init__(auto_type=False, default=..., alias=None)

this field provide jmespath and special methods API

Parameters:

Name Type Description Default
auto_type bool

usage auto_type feature. Default False

False
default Any

default

...
alias Optional[str]

field alias. default None

None
Source code in scrape_schema/field.py
def __init__(
    self, auto_type: bool = False, default: Any = ..., alias: Optional[str] = None
) -> None:
    """Create a field providing the jmespath and special methods API.

    Args:
        auto_type: use the auto_type feature. Default False
        default: default value
        alias: field alias. default None
    """
    base_options = {"auto_type": auto_type, "default": default, "alias": alias}
    super().__init__(**base_options)

get(default=None)

Serialize and return the matched nodes in a single string.

Percent encoded content is unquoted.

Unlike Parsel.get(), this method has no key parameter; only default is accepted.

Parameters:

Name Type Description Default
default Optional[str]

invoke Selector.get() method and return default value if is None

None

Raises:

Type Description
TypeError

if passed default and key arguments

Source code in scrape_schema/field.py
def get(
    self, default: Optional[str] = None
) -> SpecialMethodsProtocol:  # type: ignore
    """Serialize and return the matched nodes in a single string.

    Percent encoded content is unquoted.

    Args:
        default: invoke Selector.get() and return this value if the result is None
    """
    method_name = "get"
    return self.add_method(method_name, default)  # type: ignore

getall()

Call the .get() method for each element in this list and return their results flattened, as a list of strings.

Returns:

Type Description
SpecialMethodsProtocol

list[str]

Source code in scrape_schema/field.py
def getall(self) -> SpecialMethodsProtocol:
    """Call the .get() method for each element in this list
    and return the results flattened, as a list of strings.

    Returns:
        list[str]
    """
    method_name = "getall"
    return self.add_method(method_name)  # type: ignore

jmespath(query, **kwargs)

Find objects matching the JMESPath query and return the result as a SelectorList instance with all elements flattened. List elements implement Selector interface too.

Parameters:

Name Type Description Default
query str

JMESPath string query

required
**kwargs Any

Any additional named arguments are passed to the underlying

{}
Source code in scrape_schema/field.py
def jmespath(self, query: str, **kwargs: Any) -> Self:
    """Find objects matching the JMESPath ``query``.

    The result is a SelectorList instance with all elements flattened;
    list elements implement the `Selector` interface too.

    Args:
        query: JMESPath string query
        **kwargs: additional named arguments forwarded together with the query
    """
    method_name = "jmespath"
    return self.add_method(method_name, query, **kwargs)

Bases: Field

This field provide special methods for raw text data (regex only)

Source code in scrape_schema/field.py
class Text(Field):
    """Field providing the special methods for raw text data (regex only)"""

    def __init__(
        self, auto_type: bool = True, default: Any = ..., alias: Optional[str] = None
    ):
        """Create a field providing the special methods API.

        Args:
            auto_type: use the auto type feature in BaseSchema scope. Default True
            default: default value used if a method fails with an error.
            alias: field alias. default None
        """
        super().__init__(auto_type=auto_type, default=default, alias=alias)
        # every Text field starts its chain with these two calls, which
        # prepare getting the raw text
        initial_calls = (("xpath", "//body/p/text()"), ("get",))
        for call in initial_calls:
            self.add_method(*call)

__init__(auto_type=True, default=..., alias=None)

this field provide special methods API

Parameters:

Name Type Description Default
auto_type bool

usage auto type feature in BaseSchema scope. Default True

True
default Any

set default value, if method return traceback.

...
alias Optional[str]

field alias. default None

None
Source code in scrape_schema/field.py
def __init__(
    self, auto_type: bool = True, default: Any = ..., alias: Optional[str] = None
):
    """Create a field providing the special methods API.

    Args:
        auto_type: use the auto type feature in BaseSchema scope. Default True
        default: default value used if a method fails with an error.
        alias: field alias. default None
    """
    super().__init__(auto_type=auto_type, default=default, alias=alias)
    # every Text field starts its chain with these two calls, which
    # prepare getting the raw text
    initial_calls = (("xpath", "//body/p/text()"), ("get",))
    for call in initial_calls:
        self.add_method(*call)