# libb.pandasutils

"""Pandas wrappers and utilities

This module provides utility functions for pandas DataFrames and Series,
including null checking, type downcasting, fuzzy merging, and timezone data.
"""
from __future__ import annotations

import gc
import gzip
import os
import shutil
import tarfile
import tempfile
from pathlib import Path

__all__ = ['is_null', 'download_tzdata', 'downcast', 'fuzzymerge']


def is_null(x):
    """Check if value is null/None (pandas required).

    For array-like inputs (list, numpy array), returns True only if ALL
    elements are null. This avoids the "ambiguous truth value" error that
    occurs when using pandas.isnull() on arrays in boolean contexts.

    :param x: Value to check.
    :returns: True if value is null/None/NaN, or if array-like and all
        elements are null.
    :rtype: bool

    Example::

        >>> import datetime
        >>> import numpy as np
        >>> assert is_null(None)
        >>> assert not is_null(0)
        >>> assert is_null(np.nan)
        >>> assert not is_null(datetime.date(2000, 1, 1))
        >>> assert is_null([])
        >>> assert is_null([None, None])
        >>> assert not is_null([1, 2, 3])
        >>> assert not is_null([None, 1])
    """
    import numpy as np
    from pandas import isnull
    try:
        from pyarrow.lib import NullScalar
        if isinstance(x, NullScalar):
            return True
    except ImportError:
        pass
    if isinstance(x, np.ndarray):
        if x.size == 0:
            return True
        return all(is_null(v) for v in x.flat)
    if isinstance(x, list):
        if len(x) == 0:
            return True
        return all(is_null(v) for v in x)
    return isnull(x)


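# Why is_null reduces array-likes to a single bool (illustrative sketch, not
# part of the module API): pandas.isnull returns an elementwise mask for
# array-likes, which raises when used directly in a boolean context.
#
#     >>> import pandas as pd
#     >>> mask = pd.isnull([None, 1])
#     >>> mask
#     array([ True, False])
#     >>> bool(mask)  # "ambiguous truth value"
#     Traceback (most recent call last):
#         ...
#     ValueError: ...
#     >>> is_null([None, 1])  # all-null semantics: safe in `if` checks
#     False

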
def download_tzdata():
    """Download timezone data for pyarrow date wrangling.

    Downloads to the "Downloads" folder.
    """
    from libb import download_file, expandabspath

    base = expandabspath('~/Downloads') / 'tzdata'
    base.mkdir(exist_ok=True)
    temppath = Path(tempfile.gettempdir())
    tzgz = download_file(
        'https://data.iana.org/time-zones/releases/tzdata2022f.tar.gz',
        temppath / 'tzdata2022f.tar.gz',
    )
    with gzip.open(tzgz, 'rb') as fin:
        tztar = temppath / 'tzdata2022f.tar'
        with tztar.open('wb') as fout:
            shutil.copyfileobj(fin, fout)
    tarfile.open(tztar).extractall(base)
    zoneB = download_file(
        'https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml',
        temppath / 'windowsZones.xml',
    )
    shutil.copy(zoneB, base / 'windowsZones.xml')


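# Usage sketch (illustrative; assumes a Windows host, where Arrow's C++ layer
# by default looks for the IANA timezone database under
# %USERPROFILE%\Downloads\tzdata, which is why this helper downloads there).
# After running it, timezone-aware pyarrow operations should work:
#
#     >>> download_tzdata()                                    # doctest: +SKIP
#     >>> import pyarrow as pa                                  # doctest: +SKIP
#     >>> arr = pa.array([0], pa.timestamp('s'))                # doctest: +SKIP
#     >>> arr.cast(pa.timestamp('s', tz='America/New_York'))    # doctest: +SKIP

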
def downcast(df, rtol=1e-05, atol=1e-08, numpy_dtypes_only=False):
    """Downcast DataFrame to minimum viable type for each column.

    Ensures resulting values are within tolerance of original values.

    :param DataFrame df: DataFrame to downcast.
    :param float rtol: Relative tolerance for numeric comparison.
    :param float atol: Absolute tolerance for numeric comparison.
    :param bool numpy_dtypes_only: Use only numpy dtypes.
    :returns: Downcasted DataFrame.
    :rtype: DataFrame

    .. note::

        See `numpy.allclose <https://numpy.org/doc/stable/reference/generated/numpy.allclose.html>`_
        for tolerance parameters.

    Example::

        >>> from numpy import linspace, random
        >>> from pandas import DataFrame
        >>> data = {
        ...     "integers": linspace(1, 100, 100),
        ...     "floats": linspace(1, 1000, 100).round(2),
        ...     "booleans": random.choice([1, 0], 100),
        ...     "categories": random.choice(["foo", "bar", "baz"], 100)}
        >>> df = DataFrame(data)
        >>> downcast(df, rtol=1e-10, atol=1e-10).info()
        <class 'pandas.core.frame.DataFrame'>
        ...
        dtypes: bool(1), category(1), float64(1), uint8(1)
        memory usage: 1.3 KB
        >>> downcast(df, rtol=1e-05, atol=1e-08).info()
        <class 'pandas.core.frame.DataFrame'>
        ...
        dtypes: bool(1), category(1), float32(1), uint8(1)
        memory usage: 964.0 bytes
    """
    import pdcast
    from pdcast import downcast as pdc_downcast
    pdcast.options.RTOL = rtol
    pdcast.options.ATOL = atol
    return pdc_downcast(df, numpy_dtypes_only=numpy_dtypes_only)


def fuzzymerge(df1, df2, right_on, left_on, usedtype='uint8', scorer='WRatio',
               concat_value=True, **kwargs):
    """Merge two DataFrames using fuzzy matching on specified columns.

    Performs fuzzy matching between DataFrames based on specified columns,
    useful for matching data with small variations like typos or
    abbreviations.

    :param DataFrame df1: First DataFrame to merge.
    :param DataFrame df2: Second DataFrame to merge.
    :param str right_on: Column name in df1 for matching.
    :param str left_on: Column name in df2 for matching.
    :param usedtype: Data type for distance matrix (default: uint8).
    :param scorer: Scoring function for fuzzy matching (default: WRatio).
    :param bool concat_value: Add similarity scores column (default: True).
    :param kwargs: Additional arguments for pandas.merge.
    :returns: Merged DataFrame with fuzzy-matched rows.
    :rtype: DataFrame

    Example::

        >>> df1 = read_csv(  # doctest: +SKIP
        ...     "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
        ... )
        >>> df2 = df1.copy()  # doctest: +SKIP
        >>> df2 = concat([df2 for x in range(3)], ignore_index=True)  # doctest: +SKIP
        >>> df2.Name = (df2.Name + random.uniform(1, 2000, len(df2)).astype("U"))  # doctest: +SKIP
        >>> df1 = concat([df1 for x in range(3)], ignore_index=True)  # doctest: +SKIP
        >>> df1.Name = (df1.Name + random.uniform(1, 2000, len(df1)).astype("U"))  # doctest: +SKIP
        >>> df3 = fuzzymerge(df1, df2, right_on='Name', left_on='Name', usedtype=uint8, scorer=partial_ratio,  # doctest: +SKIP
        ...                  concat_value=True)
    """
    from numexpr import evaluate
    from numpy import amax, tile, uint8  # noqa: F401 - uint8 used by eval
    from numpy import where
    from rapidfuzz.fuzz import WRatio, partial_ratio  # noqa: F401 - used by eval
    from rapidfuzz.process import cdist

    # resolve string arguments (e.g. 'uint8', 'WRatio') to the imported objects
    if isinstance(usedtype, str):
        usedtype = eval(usedtype)
    if isinstance(scorer, str):
        scorer = eval(scorer)

    a = df1[right_on].__array__().astype('U')
    b = df2[left_on].__array__().astype('U')
    allcom = cdist(
        a,
        b,
        scorer=scorer,
        dtype=usedtype,
        workers=g if (g := os.cpu_count() - 1) > 1 else 1,
    )
    max_values = amax(allcom, axis=1)
    df1index, df2index = where(
        evaluate(
            'a==b',
            global_dict={},
            local_dict={
                'a': allcom,
                'b': tile(max_values.reshape((-1, 1)), (1, allcom.shape[1])),
            },
        ))
    concatvalue = allcom[df1index, df2index].copy()
    del allcom
    gc.collect()
    kwargs['right_index'] = True
    kwargs['left_index'] = True
    toggi = (df1
             .iloc[df1index]
             .reset_index(drop=False)
             .merge(df2
                    .iloc[df2index]
                    .reset_index(drop=False),
                    **kwargs))
    if concat_value:
        toggi['concat_value'] = concatvalue
    return toggi


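# Minimal usage sketch (hypothetical data; column names are illustrative):
# fuzzy-join two small frames whose keys differ by typos, keeping the
# similarity score in the 'concat_value' column.
#
#     >>> from pandas import DataFrame                                          # doctest: +SKIP
#     >>> left = DataFrame({'Name': ['Acme Corp', 'Globex Inc'], 'x': [1, 2]})  # doctest: +SKIP
#     >>> right = DataFrame({'Name': ['Acme Corporation', 'Globex Incorporated'],
#     ...                    'y': [10, 20]})                                    # doctest: +SKIP
#     >>> merged = fuzzymerge(left, right, right_on='Name', left_on='Name',
#     ...                     scorer='partial_ratio', suffixes=('_l', '_r'))    # doctest: +SKIP
#     >>> merged[['Name_l', 'Name_r', 'x', 'y', 'concat_value']]                # doctest: +SKIP

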
if __name__ == '__main__':
    # NORMALIZE_WHITESPACE (4) | ELLIPSIS (8) | IGNORE_EXCEPTION_DETAIL (32)
    __import__('doctest').testmod(optionflags=4 | 8 | 32)