PKLNZneat_panda/__init__.py# -*- coding: utf-8 -*- """Top-level package for Neat Panda.""" __author__ = """Henric Sundberg""" __email__ = "henric.sundberg@gmail.com" __version__ = "0.7.1" from ._tidy import spread, gather from ._caretaker import clean_column_names, _clean_column_names PKpN y++neat_panda/_caretaker.py# -*- coding: utf-8 -*- import re from collections import Counter from typing import Union, Optional, List, Dict, Any import pandas as pd import pandas_flavor as pf @pf.register_dataframe_method def clean_column_names( object_: Union[List[Union[str, int]], pd.Index, pd.DataFrame], convert_duplicates: bool = True, convert_camel_case: bool = False, ) -> Union[List[str], pd.DataFrame]: """Clean messy column names. Inspired by the functions make_clean_names and clean_names from the R package janitor. Does not alter the original DataFrame. Parameters ---------- object_ : Union[List[Union[str, int]], pd.Index, pd.DataFrame]\n Messy columnnames in a list or as a pandas index or a dataframe with messy columnames convert_duplicates : bool, optional\n If True, unique columnnames are created. E.g. if there are two columns, country and Country, this option set the columnnames to country1 and country2. By default True convert_camel_case : bool, optional\n Converts camel case to snake case. E.g the columnname SubRegion is changed to sub_region. However, it only works for actual camel case names, like the example above. If instead the original columname where SUbRegion the resulting converted name would be s_ub_region. Hence, use this option with caution. By default False Returns ------- List[str] or a pandas DataFrame\n A list of cleaned columnames or a dataframe with cleaned columnames Raises ------ TypeError\n Raises TypeError if the passed object_ is not a list, pandas index or a pandas dataframe """ if isinstance(object_, (list, pd.Index)): columns = _clean_column_names_list( columns=object_, convert_duplicates=convert_duplicates, convert_camel_case=convert_camel_case, ) return columns elif isinstance(object_, pd.DataFrame): df = _clean_column_names_dataframe( df=object_, convert_duplicates=convert_duplicates, convert_camel_case=convert_camel_case, ) return df else: raise TypeError( f"The passed object_ is a {type(object_)}. It must be a list, pandas index or a pandas dataframe!" ) def _clean_column_names_list( columns: Union[List[Union[str, int]], pd.Index], convert_duplicates: bool = True, convert_camel_case: bool = False, ) -> List[str]: """Cleans messy columnames. Written to be a utility function. It is recommended to use the clean_columnames function instead. Regex that replace multiple spaces with one space i based on the user Nasir's answer at [StackOverflow](https://stackoverflow.com/questions/1546226/simple-way-to-remove-multiple-spaces-in-a-string) Regex that replace all non-alphanumeric characters in a string (except underscore) with underscore is based on the user psun's answer at [StackOverflow](https://stackoverflow.com/questions/12985456/replace-all-non-alphanumeric-characters-in-a-string/12985459) Parameters ---------- columns : Union[List[Union[str, int]], pd.Index]\n Messy columnames convert_duplicates : bool, optional\n If True, unique columnnames are created. E.g. if there are two columns, country and Country, this option set the columnnames to country1 and country2. By default True convert_camel_case : bool, optional\n Converts camel case to snake case. E.g the columnname SubRegion is changed to sub_region. However, it only works for actual camel case names, like the example above. If instead the original columname where SUbRegion the resulting converted name would be s_ub_region. Hence, use this option with caution. By default False Returns ------- List[str]\n Cleaned columnnames """ columns = _clean_column_names( columns=columns, convert_duplicates=convert_duplicates, convert_camel_case=convert_camel_case, expressions=[ r"column.lower()", # set columnnames to lowercase r're.sub(r"\s+", " ", column).strip()', # replace multiple spaces with one space r're.sub(r"\W+", "_", column).strip()', # replace all non-alphanumeric characters in a string (except underscore) with underscore r'column.rstrip("_").lstrip("_")', # remove leading and lagging underscores ], ) return columns def _clean_column_names_dataframe( df: pd.DataFrame, convert_duplicates: bool = True, convert_camel_case: bool = False ) -> pd.DataFrame: """Cleans messy columnames of a dataframe. Written to be a utility function. It is recommended to use the clean_columnames function instead. Does not alter the original DataFrame. Parameters ---------- df : pd.DataFrame\n A dataframe with messy columnnames convert_duplicates : bool, optional\n If True, unique columnnames are created. E.g. if there are two columns, country and Country, this option set the columnnames to country1 and country2. By default True convert_camel_case : bool, optional\n Converts camel case to snake case. E.g the columnname SubRegion is changed to sub_region. However, it only works for actual camel case names, like the example above. If instead the original columname where SUbRegion the resulting converted name would be s_ub_region. Hence, use this option with caution. By default False Returns ------- pd.DataFrame\n A dataframe with cleaned columnames Raises ------ TypeError\n If the df object is not a pandas dataframe TypeError is raised """ if not isinstance(df, pd.DataFrame): raise TypeError( f"The passed df is a {type(df)}. It must be a pandas dataframe!" ) df.columns = _clean_column_names_list( columns=df.columns, convert_duplicates=convert_duplicates, convert_camel_case=convert_camel_case, ) return df def _clean_column_names( columns: Union[List[Union[str, int]], pd.Index], custom: Dict[Any, Any] = None, expressions: List[str] = None, convert_duplicates: bool = True, convert_camel_case: bool = False, ) -> List[str]: """Base function for clean_columnames. Can be used for very specific needs. ---------- columns : Union[List[Union[str, int]], pd.Index]\n Messy columnnames custom : Dict[Any, Any], optional\n If you want to replace one character with another this option can be used. E.g if you want exclamationpoint to be replaced with dollarsign, pass the following: /{'!':'$'/}. Use with caution if the expression parameter is used since the expression parameter is evaluated after the custom parameter. By default None expressions : List[str], optional\n In this parameter any string method or regex can be passed. The must be passed as a string with column as object. E.g if you want, as in the example with in the custom parameter, wants to exclamationpoint to be replaced with dollarsign, pass the following: ["column.replace('!', '$')"] or you want capitalize the columns: ["column.capitalize()"] or you want to replace multiple spaces with one space: [r're.sub(r"\s+", " ", column).strip()'] # noqa: W605 or if you want to do all of the above: ['column.replace("!", "$")', 'column.capitalize()', r're.sub(r"\s+", " ", column).strip()' # noqa: W605 ] By default None convert_duplicates : bool, optional\n If True, unique columnnames are created. E.g. if there are two columns, country and Country, this option set the columnnames to country1 and country2. By default True convert_camel_case : bool, optional\n Converts camel case to snake case. E.g the columnname SubRegion is changed to sub_region. However, it only works for actual camel case names, like the example above. If instead the original columname where SUbRegion the resulting converted name would be s_ub_region. Hence, use this option with caution. By default False Returns ------- List[str]\n Clean columnnames Raises ------ TypeError\n If passed column object is not a list or a pandas index TypeError is raised """ if not isinstance(columns, (list, pd.Index)): raise TypeError( f"The passed columns is a {type(columns)}. It must be a list or a pandas index!" ) if type(columns) == pd.Index: columns = columns.to_list() # type: ignore columns = [str(column) for column in columns] if custom: for i, j in custom.items(): columns = [k.replace(i, j) for k in columns] if convert_camel_case: columns = _camel_to_snake(columns=columns) if expressions: for reg in expressions: columns = [ eval(reg, {}, {"column": column, "re": re}) for column in columns ] if convert_duplicates: columns = _convert_duplicates(columns=columns) return columns def _camel_to_snake(columns: List[str]) -> List[str]: """Converts a list of strings with camel case formatting to a list of strings with snake case formatting Code is based on code from [StackOverflow](https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case) Parameters ---------- columns : List[str] A list of strings with camel case formatting Returns ------- List A list of strings with snake case formatting Example ------- ```python a = ["CountryName", "SubRegion"] b = _camel_to_snake(columns=a) print(b) ["country_name", "sub_region"] ``` """ _cols = [] for i in columns: i = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", i) i = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", i).lower().replace("__", "_") _cols.append(i) return _cols def _convert_duplicates(columns: List[str]) -> List[str]: """Adds progressive numbers to a list of duplicate strings. Ignores non-duplicates. Function is based on code from [StackOverflow](https://stackoverflow.com/questions/30650474/python-rename-duplicates-in-list-with-progressive-numbers-without-sorting-list/30651843#30651843) Parameters ---------- columns : List[str]\n A list of strings Returns ------- List[str]\n A list of strings with progressive numbers added to duplicates. Example ------- ```python a = ["country_name", "sub_region", "country_name"]\n b = _convert_duplicates(columns=a)\n print(b) ["country_name1", "sub_region", "country_name2"] ``` """ d: Dict[str, List] = { a: list(range(1, b + 1)) if b > 1 else [] for a, b in Counter(columns).items() } columns = [i + str(d[i].pop(0)) if len(d[i]) else i for i in columns] return columns if __name__ == "__main__": pass PKhtNŖ;]]neat_panda/_helpers.pyimport pandas as pd from warnings import warn def _control_types( _df, _key, _value, _fill="NaN", _convert=False, _sep=None, _columns=[], _drop_na=False, _invert_columns=False, ): # spread and gather if not isinstance(_df, pd.DataFrame): raise TypeError("write something") if not isinstance(_key, str): raise TypeError() if not isinstance(_value, str): raise TypeError() # spread if isinstance(_fill, bool): raise TypeError() if not isinstance(_fill, (str, float, int)): raise TypeError() if not isinstance(_convert, bool): raise TypeError() if not isinstance(_sep, (str, type(None))): raise TypeError() # gather if not isinstance(_columns, (list, range)): raise TypeError() if isinstance(_columns, range) and len(_df.columns) - 1 < _columns[-1]: raise IndexError() if not isinstance(_drop_na, bool): raise TypeError() if not isinstance(_invert_columns, bool): raise TypeError() def _assure_consistent_value_dtypes(new_df, old_df, columns, value): """ """ _dtype = old_df[value].dtypes _error_columns = [] for col in columns: try: new_df[col] = new_df[col].astype(_dtype) except ValueError: new_df[col] = new_df[col].astype("O") _error_columns.append(col) continue if _error_columns: warn( UserWarning( f"""Atleast one NaN is generated in the following columns: {", ".join(_error_columns)}. Hence, the type of these columns is set to Object.""" ) ) return new_df def _custom_columns(columns, new_columns, key, sep): _cols = [i for i in columns if i not in new_columns] _custom = [key + sep + i for i in new_columns] return _cols + _custom PKgJN/Lneat_panda/_tidy.py# -*- coding: utf-8 -*- from typing import Union, Optional, List from ._helpers import _control_types, _assure_consistent_value_dtypes, _custom_columns import pandas as pd import pandas_flavor as pf @pf.register_dataframe_method def spread( df: pd.DataFrame, key: str, value: str, fill: Union[str, int, float] = "NaN", convert: bool = False, drop: bool = False, sep: Optional[str] = None, ) -> pd.DataFrame: """Spread a key-value pair across multiple columns. Behaves similar to the tidyr spread function.\n Does not work with multi index dataframes. Syntactic sugar for the pandas pivot method. Parameters ---------- df : pd.DataFrame\n A DataFrame key : str\n Column to use to make new frame’s columns value : str\n Column which contains values corresponding to the new frame’s columns fill : Union[str, int, float], optional\n Missing values will be replaced with this value.\n (the default is "NaN", which is numpy.nan) convert : bool, optional\n If True, the function tries to set the new columns datatypes to the original frame's value column datatype. However, if fill is equal to "NaN", all columns with a 'filled' value is set to the object type since Numpy.nan is of that type\n (the default is False, which ...) drop : bool, optional\n If True, all rows that contains at least one "NaN" is dropped. (the default is False) sep : Optional[str], optional\n If set, the names of the new columns will be given by "".\n E.g. if set to '-' and the key column is called 'Year' and contains 2018 and 2019 the new columns will be\n 'Year-2018' and 'Year-2019'. (the default is None, and using previous example, the new column names will be '2018' and '2019') Returns ------- pd.DataFrame\n A widened dataframe Example ------- ```python from neat_panda import spread from gapminder import gapminder gapminder2 = gapminder[["country", "continent", "year", "pop"]] gapminder3 = spread(df=gapminder2, key="year", value="pop") # or gapminder3 = gapminder2.pipe(spread, key="year", value="pop") print(gapminder3) country continent 1952 1957 1962 ... 0 Afghanistan Asia 8425333 9240934 10267083 ... 1 Albania Europe 1282697 1476505 1728137 ... 2 Algeria Africa 9279525 10270856 11000948 ... 3 Angola Africa 4232095 4561361 4826015 ... 4 Argentina Americas 17876956 19610538 21283783 ... . ... ... ... ... ... ... ```python """ _control_types( _df=df, _key=key, _value=value, _fill=fill, _convert=convert, _sep=sep ) _drop = [key, value] _columns = [i for i in df.columns.tolist() if i not in _drop] _df = df.set_index(_columns).pivot(columns=key) _df.columns = _df.columns.droplevel() new_df = pd.DataFrame(_df.to_records()) _new_columns = [i for i in new_df.columns if i not in df.columns] if sep: custom_columns = _custom_columns( new_df.columns.to_list(), _new_columns, key, sep ) new_df.columns = custom_columns _new_columns = [i for i in new_df.columns if i not in df.columns] if fill != "NaN": new_df[_new_columns] = new_df[_new_columns].fillna(fill) if drop: new_df = new_df.dropna(how="any") if convert: new_df = _assure_consistent_value_dtypes(new_df, df, _new_columns, value) return new_df @pf.register_dataframe_method def gather( df: pd.DataFrame, key: str, value: str, columns: Union[List[str], range], drop_na: bool = False, convert: bool = False, invert_columns: bool = False, ) -> pd.DataFrame: """Collapses/unpivots multiple columns into two columns, one with the key and one with the value. Behaves similir to the tidyr function gather. Parameters ---------- df : pd.DataFrame\n An untidy dataframe key : str\n Name of the new key column value : str\n Name of the new value column columns : Union[List[str], range]\n If invert_columns is set to False, as per default, the columns to unpivot. If invert columns is set to True, the columns NOT to pivot. Columns should be given as a list of string or a range of columns indexes. drop_na : bool, optional\n If True, all rows that contains at least one "NaN" is dropped. (the default is False) convert : bool, optional If True, the function uses infer_objects to set datatype (the default is False) invert_columns : bool, optional\n Should be used in conjunction with columns. If set to True, the columns set will be switched to the ones not present in the list (range). (the default is False) Returns ------- pd.DataFrame\n A tidy gathered dataframe Example ------- ```python from neat_panda import gather from gapminder import gapminder gapminder2 = gapminder[["country", "continent", "year", "pop"]] gapminder3 = spread(df=gapminder2, key="year", value="pop") gapminder4 = gather(gapminder3, key="year", value="pop", columns=range(2, 13)) # or gapminder4 = gather(gapminder3, key="year", value="pop", columns=range(0, 2), invert_columns=True) # or years = ["1952", "1957", "1962", "1967", "1972", "1977", "1982", "1987", "1992", "1997", "2002", "2007"] gapminder4 = gather(gapminder3, key="year", value="pop", columns=years) # or gapminder4 = gather(gapminder3, key="year", value="pop", columns=["country", "continent"], invert_columns=True) print(gapminder4) country continent year pop 0 Afghanistan Asia 1952 8425333 1 Albania Europe 1952 1282697 2 Algeria Africa 1952 9279525 3 Angola Africa 1952 4232095 4 Argentina Americas 1952 17876956 . ... ... ... ... ``` """ _control_types( _df=df, _key=key, _value=value, _columns=columns, _drop_na=drop_na, _convert=convert, _invert_columns=invert_columns, ) _all_columns = df.columns.to_list() if isinstance(columns, range): _temp_col = [] _index = list(columns) for i, j in enumerate(_all_columns): if i in _index: _temp_col.append(j) columns = _temp_col if invert_columns: columns = [i for i in _all_columns if i not in columns] _id_vars = [i for i in _all_columns if i not in columns] new_df = pd.melt( frame=df, id_vars=_id_vars, value_vars=columns, value_name=value, var_name=key ) if drop_na: new_df = new_df.dropna(how="all", subset=[value]) if convert: _dtype = new_df[value].infer_objects().dtypes new_df[value] = new_df[value].astype(_dtype) return new_df if __name__ == "__main__": pass PK(SqN$22"neat_panda-0.7.1.dist-info/LICENSEMIT License Copyright (c) 2019, Henric Sundberg Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HPO neat_panda-0.7.1.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,szd&Y)r$[)T&UrPK!Hmr+#neat_panda-0.7.1.dist-info/METADATAWms۸_u#Q$ZΩyn!XS$  $(J]v_XMZ?YY1m 7-j EQpe5=ˡMlJhiTؼ•u%N4ڌb[WmsBvƪ%?X."Y3*h#tj c*rwq&#8[:oa"YUcPeQ_>Tq~]R0^Tl1t v 5'/G櫼=6a $cE̤|骸Ƈ*^ޣ)y/3-FuiG^vYv #Z:o2Q@{~i|P3Q0r 9<6MQ06z{%V.aG24?]UUv}aKMX3+rqq?DSܧS.ͪ,yki_,OTihӄݛ g]ŷbqCL,c!Zy洈Moإ 2MɊL}_>o:v9n\V߇Qe91c27,˴,bɚ&/]VUxASt?ag[!+fz 1D,6M3g4AJ-〦7q7IpeOE{"^# (;)Vh-_|,y?DsSE;znΉ)Q,r ?z&wlt&ƥ_ G >Aw;rX>w))c! ƓgH2(~.g啿@R&oޫl1muՑ}[ܱOBvx;&&p|W7m hŦٮ*yQ񬨏ŬK|Ը.cdfgRFahMr B 6=<_́8s#">B'@IdIJe$ lw#o*2W'0E+]Ϸ#r11< D< ͷ}<(\ ,L~As%YDiAXLAn1鄐_-sH'z ˃`Е 0!xh<.1;`#i'86k94TuW 0 ?A `,d Lx{ ꎮ@CDEFGHIJKLMASj`'M((M))M**M++M,ͬZwhP=44444444444UDmC44444444iiҹwONNO3OSOsOOOOOp]W5_ʎ`c'_O/"[~ݮ{]o{gخD]*Kw}5цlyw=u!oxCߨ^`l픚zv_JM=$.=L_f