Source code for libb.text

import base64
import contextlib
import logging
import quopri
import random
import string
import unicodedata

import regex as re

from libb._libb import collapse, sanitize_vulgar_string, uncamel
from libb._libb import underscore_to_camelcase

with contextlib.suppress(ImportError):
    import chardet

with contextlib.suppress(ImportError):
    import ftfy

logger = logging.getLogger(__name__)

__all__ = [
    'random_string',
    'fix_text',
    'underscore_to_camelcase',
    'uncamel',
    'strip_ascii',
    'sanitize_vulgar_string',
    'round_digit_string',
    'parse_number',
    'truncate',
    'rotate',
    'smart_base64',
    'strtobool',
    'fuzzy_search',
    'is_numeric',
]

#
# useful constants for writing unicode-based context-free grammars
#

UNI_ALL = ''.join(chr(_) for _ in range(65536))
UNI_DECIMALS = ''.join(_ for _ in UNI_ALL if unicodedata.category(_) == 'Nd')
UNI_SLASHES = chr(47) + chr(8260) + chr(8725)
UNI_SUPERSCRIPTS = chr(8304) + chr(185) + chr(178) + chr(179) + ''.join(chr(_) for _ in range(8308, 8314))
UNI_SUBSCRIPTS = ''.join(chr(_) for _ in range(8320, 8330))
UNI_VULGAR_FRACTIONS = chr(188) + chr(189) + chr(190) + ''.join(chr(_) for _ in range(8531, 8543))
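
# Illustrative sketch (not part of the original module): one way to use these
# constants is to interpolate them into regex character classes, e.g.
#
#     >>> re.match(rf'[{UNI_VULGAR_FRACTIONS}]', '½') is not None
#     True
#     >>> re.findall(rf'[{UNI_DECIMALS}]+[{UNI_SLASHES}][{UNI_DECIMALS}]+', '3/4 cup and 1⁄2 tsp')
#     ['3/4', '1⁄2']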

SUPERSCRIPT = dict(zip(UNI_SUPERSCRIPTS, range(10)))
SUBSCRIPT = dict(zip(UNI_SUBSCRIPTS, range(10)))
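
# Illustrative (not part of the original module): SUPERSCRIPT and SUBSCRIPT map
# each superscript/subscript digit character to its integer value, e.g.
#
#     >>> SUPERSCRIPT['²'], SUBSCRIPT['₃']
#     (2, 3)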

# Vulgar fraction mapping kept for backwards compatibility
_VULGAR_FRACTIONS = (
    1 / 4.0,
    2 / 4.0,
    3 / 4.0,
    1 / 3.0,
    2 / 3.0,
    1 / 5.0,
    2 / 5.0,
    3 / 5.0,
    4 / 5.0,
    1 / 6.0,
    5 / 6.0,
    1 / 8.0,
    3 / 8.0,
    5 / 8.0,
    7 / 8.0,
)
VULGAR_FRACTION = dict(zip(UNI_VULGAR_FRACTIONS, _VULGAR_FRACTIONS))
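
# Illustrative (not part of the original module): VULGAR_FRACTION maps each
# vulgar-fraction glyph to its float value, following the order above, e.g.
#
#     >>> VULGAR_FRACTION['½'], VULGAR_FRACTION['⅛']
#     (0.5, 0.125)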


def random_string(length):
    """Generate a random alphanumeric string.

    :param int length: Length of the string to generate.
    :returns: Random string of uppercase letters and digits.
    :rtype: str
    """
    return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))


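# Illustrative usage (these lines are not part of the original module); the
# alphabet is uppercase ASCII letters plus digits:
#
#     >>> token = random_string(8)
#     >>> len(token)
#     8
#     >>> all(c in string.ascii_uppercase + string.digits for c in token)
#     True

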
def fix_text(text):
    r"""Use ftfy magic to fix text encoding issues.

    :param str text: Text to fix.
    :returns: Fixed text.
    :rtype: str

    Example::

        >>> fix_text('âœ" No problems')  # doctest: +SKIP
        '✔ No problems'
        >>> print(fix_text("¯\\_(ã\x83\x84)_/¯"))
        ¯\_(ツ)_/¯
        >>> fix_text('Broken text… it’s flubberific!')
        "Broken text… it's flubberific!"
        >>> fix_text('LOUD NOISES')
        'LOUD NOISES'
    """
    return ftfy.fix_text(text)


# underscore_to_camelcase and uncamel are now implemented in Rust
# See libb._libb for the implementations


def strip_ascii(s):
    """Remove non-ASCII characters from a string.

    :param str s: Input string.
    :returns: String with only ASCII characters.
    :rtype: str
    """
    return s.encode('ascii', errors='ignore').decode()


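# Illustrative usage (not part of the original module); non-ASCII characters
# are dropped rather than transliterated:
#
#     >>> strip_ascii('café déjà vu')
#     'caf dj vu'

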
# sanitize_vulgar_string is now implemented in Rust
# See libb._libb for the implementation


def round_digit_string(s, places=None) -> str:
    """Round a numeric string to specified decimal places.

    :param str s: Numeric string to round.
    :param int places: Number of decimal places (None to preserve original).
    :returns: Rounded numeric string.
    :rtype: str

    Example::

        >>> round_digit_string('7283.1234', 3)
        '7283.123'
        >>> round_digit_string('7283.1234', None)
        '7283.1234'
        >>> round_digit_string('7283', 3)
        '7283'
    """
    s = s.strip()
    with contextlib.suppress(ValueError):
        f = float(s)
        i = int(f)
        if f == i:
            s = i
        elif places:
            s = round(f, places)
        else:
            s = f
        return str(s)
    return s


def parse_number(s: str, force=True):
    """Extract number from string.

    Handles various formats including commas, parentheses for negatives,
    and trailing characters.

    :param str s: String to parse.
    :param bool force: If True, return None on parse failure; if False,
        return the original string.
    :returns: Parsed int or float, None, or original string (if force=False).

    Example::

        >>> parse_number('1,200m')
        1200
        >>> parse_number('100.0')
        100.0
        >>> parse_number('100')
        100
        >>> parse_number('0.002k')
        0.002
        >>> parse_number('-1')
        -1
        >>> parse_number('(1)')
        -1
        >>> parse_number('-100.0')
        -100.0
        >>> parse_number('(100.)')
        -100.0
        >>> parse_number('')
        >>> parse_number('foo')
        >>> parse_number('foo', force=False)
        'foo'
    """
    if not s:
        return
    if s.endswith('.'):
        s += '0'
    if s.endswith('.)'):
        s = s[:-2] + '.0)'
    num = ''.join(re.findall(r'[\(-\d\.\)]+', s))
    if not num and force:
        return
    if not num:
        return s
    if neg := re.match(r'^\((.*)\)$', num):
        num = '-' + neg.group(1)
    i = f = None
    with contextlib.suppress(Exception):
        i = int(num)
    with contextlib.suppress(Exception):
        f = float(num)
    if not force and (i is None and f is None):
        return s
    if i == f:
        return i
    return f


def truncate(s, width, suffix='...'):
    """Truncate a string to max width characters.

    Adds suffix if the string was truncated. Tries to break on whitespace.

    :param str s: String to truncate.
    :param int width: Maximum width including suffix.
    :param str suffix: Suffix to append when truncated.
    :returns: Truncated string.
    :rtype: str
    :raises AssertionError: If width is not longer than suffix.

    Example::

        >>> truncate('fubarbaz', 6)
        'fub...'
        >>> truncate('fubarbaz', 3)
        Traceback (most recent call last):
        ...
        AssertionError: Desired width must be longer than suffix
        >>> truncate('fubarbaz', 3, suffix='..')
        'f..'
    """
    assert width > len(suffix), 'Desired width must be longer than suffix'
    if len(s) <= width:
        return s
    w = width - len(suffix)
    # if the boundary is on a space, don't include it
    if s[w].isspace():
        return s[:w] + suffix
    # break on the first whitespace from the end
    return s[:w].rsplit(None, 1)[0] + suffix


def rotate(s):
    """Apply rot13-like translation to string.

    Rotates characters including digits and punctuation.

    :param str s: String to rotate.
    :returns: Rotated string.
    :rtype: str

    Example::

        >>> rotate("foobarbaz")
        ';^^-,{-,E'
    """
    instr = string.ascii_lowercase + string.digits + string.punctuation + string.ascii_uppercase
    midpoint = len(instr) // 2
    outstr = instr[midpoint:] + instr[:midpoint]
    return str.translate(s, str.maketrans(instr, outstr))


def smart_base64(encoded_words):
    r"""Decode base64 encoded words with intelligent charset handling.

    Splits out encoded words per RFC 2047, Section 2 and handles common
    encoding issues like multiline subjects and charset mismatches.

    :param str encoded_words: Base64 encoded string or plain text.
    :returns: Decoded string (or original if not encoded).
    :rtype: str

    .. note:: See `RFC 2047, Section 2 <http://tools.ietf.org/html/rfc2047#section-2>`_

    Basic Usage::

        >>> smart_base64('=?utf-8?B?U1RaOiBGNFExNSBwcmV2aWV3IOKAkyBUaGUgc3RhcnQgb2YgdGh'
        ...              'lIGNhc2ggcmV0dXJuIHN0b3J5PyBQYXRoIHRvICQyMDAgc3RvY2sgcHJpY2U/?=')
        'STZ: F4Q15 preview – The start of the cash return story? Path to $200 stock price?'

    Multiline Subjects (common email bug - base64 encoded per line)::

        >>> smart_base64('=?UTF-8?B?JDEwTU0rIENJVCBHUk9VUCBUUkFERVMgLSBDSVQgNScyMiAxMDLi'
        ...              'hZ0tMTAz4oWbICBNSw==?=\r\n\t=?UTF-8?B?VA==?=')
        "$10MM+ CIT GROUP TRADES - CIT 5'22 102.625-103.125 MK T"

    Charset Mismatch (UTF-8 header with Latin-1 content)::

        >>> smart_base64('=?UTF-8?B?TVMgZW5lcmd5OiByaWcgMTdzIDkxwr4vOTLihZsgMThzIDkzwr4v'
        ...              'OTTihZsgMjBzIDgywg==?=\r\n\t=?UTF-8?B?vS84Mw==?=')
        'MS energy: rig 17s 91.75/92.125 18s 93.75/94.125 20s 82.5/83'

    Unicode Characters::

        >>> smart_base64('=?UTF-8?B?VGhpcyBpcyBhIGhvcnNleTog8J+Qjg==?=')
        'This is a horsey: \U0001f40e'
        >>> smart_base64('=?UTF-8?B?U0xBQiAxIOKFnDogIDEwOSAtIMK9IHYgNzYuMjU=?=')
        'SLAB 1.375: 109 - 0.5 v 76.25'

    Plain Text Passthrough::

        >>> smart_base64('This is plain text')
        'This is plain text'
    """
    re_encoded = r'=\?{1}(.+)\?{1}([B|Q])\?{1}(.+)\?{1}='
    if not re.search(re_encoded, encoded_words):
        return encoded_words

    def decode(charset, encoding, encoded_text):
        if encoding == 'B':
            fn = (base64.urlsafe_b64decode
                  if '-' in encoded_text or '\\' in encoded_text
                  else base64.standard_b64decode)
            byte_string = fn(encoded_text)
        elif encoding == 'Q':
            byte_string = quopri.decodestring(encoded_text)
        for chunk in byte_string.split():
            if m := re.match(rb'(.*)\xc2$', chunk):
                chunk = m.groups()[0]  # bad formatting
            try:
                yield chunk.decode(charset, 'strict')
            except UnicodeDecodeError:
                enc = chardet.detect(chunk)['encoding']
                yield chunk.decode(enc or charset, 'replace')

    decoded = []
    for c, e, t in re.findall(re_encoded, encoded_words):
        expand = list(collapse(list(decode(c, e, t))))
        decoded.extend(expand)
    return sanitize_vulgar_string(' '.join(decoded))


def strtobool(val):
    """Convert a string representation of truth to boolean.

    True values are 'y', 'yes', 't', 'true', 'on', and '1'.
    False values are 'n', 'no', 'f', 'false', 'off', '0', and empty string.

    :param val: Value to convert (string, bool, or None).
    :returns: Boolean value.
    :rtype: bool
    :raises ValueError: If val is not a recognized truth value.
    """
    if val is None:
        return False
    if isinstance(val, bool):
        return val
    if isinstance(val, str):
        val = val.lower()
        if val in {'y', 'yes', 't', 'true', 'on', '1'}:
            return True
        if val in {'', 'n', 'no', 'f', 'false', 'off', '0'}:
            return False
    raise ValueError(f'invalid truth value {val!r}')


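# Illustrative usage (not part of the original module); strings, bools, and
# None are accepted, anything unrecognized raises ValueError:
#
#     >>> strtobool('Yes'), strtobool('off'), strtobool(None)
#     (True, False, False)
#     >>> strtobool('maybe')
#     Traceback (most recent call last):
#     ...
#     ValueError: invalid truth value 'maybe'

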
def is_numeric(txt):
    """Check if value can be converted to a float.

    :param txt: Value to check.
    :returns: True if value can be converted to float.
    :rtype: bool

    .. warning:: Complex types cannot be converted to float.

    Example::

        >>> is_numeric('a')
        False
        >>> is_numeric(1e4)
        True
        >>> is_numeric('1E2')
        True
        >>> is_numeric(complex(-1,0))
        False
    """
    try:
        float(txt)
        return True
    except (ValueError, TypeError):
        return False


if __name__ == '__main__':
    # 4 | 8 | 32 == doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS | doctest.IGNORE_EXCEPTION_DETAIL
    __import__('doctest').testmod(optionflags=4 | 8 | 32)