{ "info": { "author": "Danilo Silva de Oliveira", "author_email": "danilooliveira28@hotmail.com", "bugtrack_url": null, "classifiers": [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Information Analysis" ], "description": "# dataframe_column_identifier\n## `latest version: 0.0.5`\n\n## What is this?\nA light and useful package to find columns in a Dataframe by its values.\n\n## Installing\n```\npip install dataframe-column-identifier==0.0.5\n```\n\n## Importing\n```\nfrom dataframe_column_identifier import DataFrameColumnIdentifier\n```\n\n## KBest - Feature Selection Using Example\n```\nimport pandas as pd\nfrom sklearn.feature_selection import SelectKBest, mutual_info_regression\nfrom dataframe_column_identifier import DataFrameColumnIdentifier\n\nprint(X_train.shape)\n(1460, 282)\n\nprint(X_test.shape)\n(1459, 282)\n\ndfci = DataFrameColumnIdentifier()\nkbest = SelectKBest(score_func=mutual_info_regression, k=10)\nkbest.fit_transform(X_train, y_train)\nkbest_get_support_output = kbest.get_support()\n\nprint(kbest_get_support_output)\narray([False, True, False, True, False, True, False, True, True,\n False, False, True, False, False, False, False, False, False,\n True, True, True, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, True, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False, False, False, False, False, False, False,\n False, False, False])\n\nprint(dfci.select_columns_KBest(X_train, kbest_get_support_output, verbose=1))\n[\n '1stFlrSF',\n 'ExterQual_TA',\n 'GarageArea',\n 'GarageCars',\n 'GarageYrBlt',\n 'GrLivArea',\n 'MSSubClass',\n 'OverallQual',\n 'TotalBsmtSF',\n 'YearBuilt'\n]\n\nX_train = dfci.transform(X_train)\nX_test = dfci.transform(X_test)\n\nprint(X_train.shape)\n(1460, 10)\n\nprint(X_test.shape)\n(1459, 10)\n\nprint(X_train.head(10))\n 1stFlrSF ExterQual_TA GarageArea GarageCars GarageYrBlt GrLivArea MSSubClass OverallQual TotalBsmtSF YearBuilt\n0 856.0 0.0 548.0 2.0 2003.0 1710.0 60.0 7.0 856.0 2003.0\n1 1262.0 1.0 460.0 2.0 1976.0 1262.0 20.0 6.0 1262.0 1976.0\n2 920.0 0.0 608.0 2.0 2001.0 1786.0 60.0 7.0 920.0 2001.0\n3 961.0 1.0 642.0 3.0 1998.0 1717.0 70.0 7.0 756.0 1915.0\n4 1145.0 0.0 836.0 3.0 2000.0 2198.0 60.0 8.0 1145.0 2000.0\n5 796.0 1.0 480.0 2.0 1993.0 1362.0 50.0 5.0 796.0 1993.0\n6 1694.0 0.0 636.0 2.0 2004.0 1694.0 20.0 8.0 1686.0 2004.0\n7 1107.0 1.0 484.0 2.0 1973.0 2090.0 60.0 7.0 1107.0 1973.0\n8 1022.0 1.0 468.0 2.0 1931.0 1774.0 50.0 7.0 952.0 1931.0\n9 1077.0 1.0 205.0 1.0 1939.0 1077.0 190.0 5.0 991.0 1939.0\n\nprint(X_test.head(10))\n 1stFlrSF ExterQual_TA GarageArea GarageCars GarageYrBlt GrLivArea MSSubClass OverallQual TotalBsmtSF YearBuilt\n0 896.0 1.0 730.0 1.0 1961.0 896.0 20.0 5.0 882.0 1961.0\n1 1329.0 1.0 312.0 1.0 1958.0 1329.0 20.0 6.0 1329.0 1958.0\n2 928.0 1.0 482.0 2.0 1997.0 1629.0 60.0 5.0 928.0 1997.0\n3 926.0 1.0 470.0 2.0 1998.0 1604.0 60.0 6.0 926.0 1998.0\n4 1280.0 0.0 506.0 2.0 1992.0 1280.0 120.0 8.0 1280.0 1992.0\n5 763.0 1.0 440.0 2.0 1993.0 1655.0 60.0 6.0 763.0 1993.0\n6 1187.0 1.0 420.0 2.0 1992.0 1187.0 20.0 6.0 1168.0 1992.0\n7 789.0 1.0 393.0 2.0 1998.0 1465.0 60.0 6.0 789.0 1998.0\n8 1341.0 1.0 506.0 2.0 1990.0 1341.0 20.0 7.0 1300.0 1990.0\n9 882.0 1.0 525.0 2.0 1970.0 882.0 20.0 4.0 882.0 1970.0\n```\n\n## Feature Selection Using Example\n```\nimport pandas as pd\nfrom sklearn.feature_selection import SelectKBest, mutual_info_regression\nfrom dataframe_column_identifier import DataFrameColumnIdentifier\n\nprint(X_train.shape)\n(1460, 282)\n\nprint(X_test.shape)\n(1459, 282)\n\ndfci = DataFrameColumnIdentifier()\nkbest = SelectKBest(score_func=mutual_info_regression, k=10)\nkbest_selected_features = kbest.fit_transform(X_train, y_train)\n\nprint(kbest_selected_features.shape)\n(1460, 10)\n\nprint(pd.DataFrame(kbest_selected_features).head(10))\n 0 1 2 3 4 5 6 7 8 9\n 0 60.0 7.0 2003.0 856.0 856.0 1710.0 2003.0 2.0 548.0 0.0\n 1 20.0 6.0 1976.0 1262.0 1262.0 1262.0 1976.0 2.0 460.0 1.0\n 2 60.0 7.0 2001.0 920.0 920.0 1786.0 2001.0 2.0 608.0 0.0\n 3 70.0 7.0 1915.0 756.0 961.0 1717.0 1998.0 3.0 642.0 1.0\n 4 60.0 8.0 2000.0 1145.0 1145.0 2198.0 2000.0 3.0 836.0 0.0\n 5 50.0 5.0 1993.0 796.0 796.0 1362.0 1993.0 2.0 480.0 1.0\n 6 20.0 8.0 2004.0 1686.0 1694.0 1694.0 2004.0 2.0 636.0 0.0\n 7 60.0 7.0 1973.0 1107.0 1107.0 2090.0 1973.0 2.0 484.0 1.0\n 8 50.0 7.0 1931.0 952.0 1022.0 1774.0 1931.0 2.0 468.0 1.0\n 9 190.0 5.0 1939.0 991.0 1077.0 1077.0 1939.0 1.0 205.0 1.0\n\nprint(dfci.select_columns_by_values(X_train, kbest_selected_features, n_validation_rows=100, verbose=1))\n[\n '1stFlrSF',\n 'ExterQual_TA',\n 'GarageArea',\n 'GarageCars',\n 'GarageYrBlt',\n 'GrLivArea',\n 'MSSubClass',\n 'OverallQual',\n 'TotalBsmtSF',\n 'YearBuilt'\n]\n\nX_train = dfci.transform(X_train)\nX_test = dfci.transform(X_test)\n\nprint(X_train.shape)\n(1460, 10)\n\nprint(X_test.shape)\n(1459, 10)\n\nprint(X_train.head(10))\n 1stFlrSF ExterQual_TA GarageArea GarageCars GarageYrBlt GrLivArea MSSubClass OverallQual TotalBsmtSF YearBuilt\n0 856.0 0.0 548.0 2.0 2003.0 1710.0 60.0 7.0 856.0 2003.0\n1 1262.0 1.0 460.0 2.0 1976.0 1262.0 20.0 6.0 1262.0 1976.0\n2 920.0 0.0 608.0 2.0 2001.0 1786.0 60.0 7.0 920.0 2001.0\n3 961.0 1.0 642.0 3.0 1998.0 1717.0 70.0 7.0 756.0 1915.0\n4 1145.0 0.0 836.0 3.0 2000.0 2198.0 60.0 8.0 1145.0 2000.0\n5 796.0 1.0 480.0 2.0 1993.0 1362.0 50.0 5.0 796.0 1993.0\n6 1694.0 0.0 636.0 2.0 2004.0 1694.0 20.0 8.0 1686.0 2004.0\n7 1107.0 1.0 484.0 2.0 1973.0 2090.0 60.0 7.0 1107.0 1973.0\n8 1022.0 1.0 468.0 2.0 1931.0 1774.0 50.0 7.0 952.0 1931.0\n9 1077.0 1.0 205.0 1.0 1939.0 1077.0 190.0 5.0 991.0 1939.0\n\nprint(X_test.head(10))\n 1stFlrSF ExterQual_TA GarageArea GarageCars GarageYrBlt GrLivArea MSSubClass OverallQual TotalBsmtSF YearBuilt\n0 896.0 1.0 730.0 1.0 1961.0 896.0 20.0 5.0 882.0 1961.0\n1 1329.0 1.0 312.0 1.0 1958.0 1329.0 20.0 6.0 1329.0 1958.0\n2 928.0 1.0 482.0 2.0 1997.0 1629.0 60.0 5.0 928.0 1997.0\n3 926.0 1.0 470.0 2.0 1998.0 1604.0 60.0 6.0 926.0 1998.0\n4 1280.0 0.0 506.0 2.0 1992.0 1280.0 120.0 8.0 1280.0 1992.0\n5 763.0 1.0 440.0 2.0 1993.0 1655.0 60.0 6.0 763.0 1993.0\n6 1187.0 1.0 420.0 2.0 1992.0 1187.0 20.0 6.0 1168.0 1992.0\n7 789.0 1.0 393.0 2.0 1998.0 1465.0 60.0 6.0 789.0 1998.0\n8 1341.0 1.0 506.0 2.0 1990.0 1341.0 20.0 7.0 1300.0 1990.0\n9 882.0 1.0 525.0 2.0 1970.0 882.0 20.0 4.0 882.0 1970.0\n```\n\n## dataframe_column_identifier.DataFrameColumnIdentifier\n### Creating a new instance\n\n```dfci = DataFrameColumnIdentifier()```\n\n### Methods\n- select_columns_by_values :\n\n Returns the names of the Pandas DataFrame columns which are selected based on a matrix of values.\n\n ```dfci.select_columns_by_values(X, selected_values, n_validation_rows=100, verbose=1)```\n\n Parameters:\n\n - X : Pandas DataFrame\n\n A DataFrame with the columns that must be found (the DataFrame must have the columns' values either).\n\n - X_columns_values : numpy matrix \n\n The values of the columns to be found.\n\n - n_validation_rows : int, optional (default=1000)\n\n The number of rows that must be equal in the columns comparison. If the informed number is greater than the number of rows in X, the numberrows in X will be used.\n\n - verbose : int, optional (default=0)\n\n It controls the verbosity when looking for the columns.\n\n- select_columns_KBest :\n\n Returns the names of the Pandas DataFrame columns which are selected based on the KBest.get_support method's output.\n\n ```dfci.select_columns_KBest(X, kbest_get_support_output, verbose=1)```\n\n Parameters\n\n - X : Pandas DataFrame\n\n The same DataFrame used in the KBest.fit_transform method.\n\n - kbest_get_support_output : boolean array \n\n The KBest.get_support method's output.\n\n - verbose : int, optional (default=0)\n\n It controls the verbosity when looking for the columns.\n\n- transform : \n\n Returns a new Pandas DataFrame with only the columns which were selected on the select_columns_* method.\n\n ```dfci.transform(X)```\n\n Parameters:\n\n - X : Pandas DataFrame \n\n The DataFrame to be transformed (the Pandas DataFrame must have the columns that should be found).\n\n### Attributes\n- selected_columns_ : Name of the given Pandas DataFrame columns which were selected based on the given values, after the select_columns_* method execution.\n\n\n", "description_content_type": "text/markdown", "docs_url": null, "download_url": "", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://github.com/ds-oliveira/dataframe_column_identifier", "keywords": "", "license": "", "maintainer": "", "maintainer_email": "", "name": "dataframe-column-identifier", "package_url": "https://pypi.org/project/dataframe-column-identifier/", "platform": "", "project_url": "https://pypi.org/project/dataframe-column-identifier/", "project_urls": { "Homepage": "https://github.com/ds-oliveira/dataframe_column_identifier" }, "release_url": "https://pypi.org/project/dataframe-column-identifier/0.0.5/", "requires_dist": [ "pandas", "numpy" ], "requires_python": ">=3.6", "summary": "A light and useful package to find columns in a Dataframe by its values.", "version": "0.0.5" }, "last_serial": 5813151, "releases": { "0.0.1": [ { "comment_text": "", "digests": { "md5": "74a08d08783ae5e4f60b767ec1713ac0", "sha256": "ef730ff4264815505143ac245ebc8e647fb233245309aa6c92b6f949c31ded38" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.1-py3-none-any.whl", "has_sig": false, "md5_digest": "74a08d08783ae5e4f60b767ec1713ac0", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 5114, "upload_time": "2019-09-07T14:57:29", "url": "https://files.pythonhosted.org/packages/78/54/e07a26558ebe0d6cd9f5c0b7ace03f99570e4610a9ac745f40ea05f71dc5/dataframe_column_identifier-0.0.1-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "1974ad1fef418876fffd5cec08b19212", "sha256": "66b22919bbc497688af464cdbac98d5de193dc95774b24f752a3312f0793ebb8" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.1.tar.gz", "has_sig": false, "md5_digest": "1974ad1fef418876fffd5cec08b19212", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 3893, "upload_time": "2019-09-07T14:57:31", "url": "https://files.pythonhosted.org/packages/ff/a9/7e402f9465025cbd1c8738fba7b188e372b7d0ecc233720aaae102ea073a/dataframe-column-identifier-0.0.1.tar.gz" } ], "0.0.2": [ { "comment_text": "", "digests": { "md5": "62eb9206e9ea982e908e43ce28ce8928", "sha256": "53436df6beb15d33ab2f1e6e3cac5fd14b94d94a1e97bc0e224dfbae24d72bc8" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.2-py3-none-any.whl", "has_sig": false, "md5_digest": "62eb9206e9ea982e908e43ce28ce8928", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 5115, "upload_time": "2019-09-07T15:00:45", "url": "https://files.pythonhosted.org/packages/d9/72/82b8fbaba22da29d7666d12586e541341e85bae1185c42f0bfc51230e79b/dataframe_column_identifier-0.0.2-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "b1b6bde86a714641fadcda408d8c45d8", "sha256": "a89571020d8edfdbba27d4062b4940e1c06b6fbb222b639f10424cac22add319" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.2.tar.gz", "has_sig": false, "md5_digest": "b1b6bde86a714641fadcda408d8c45d8", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 3890, "upload_time": "2019-09-07T15:00:47", "url": "https://files.pythonhosted.org/packages/28/9e/40a6bb288c6fb2616e6ab0dbe406d92946d8d5b3c541bd7578b8c4b24e06/dataframe-column-identifier-0.0.2.tar.gz" } ], "0.0.3": [ { "comment_text": "", "digests": { "md5": "7a4f31c98e695de5bde6345089a96739", "sha256": "6fbd4d1a40d281b9612eee26a6016c020b167ce3170b323c758bbab465020c8c" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.3-py3-none-any.whl", "has_sig": false, "md5_digest": "7a4f31c98e695de5bde6345089a96739", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 5528, "upload_time": "2019-09-07T16:25:06", "url": "https://files.pythonhosted.org/packages/cb/ae/2275b895320b255714236cd4fe583f4772c9f55ce66603542a9539bbebce/dataframe_column_identifier-0.0.3-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "712d32b5647fdd2967c40a99f77b4c82", "sha256": "50a5d05a1ba32c37428b3242f0452fbd59b096e61743476e1524262fe0d72e49" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.3.tar.gz", "has_sig": false, "md5_digest": "712d32b5647fdd2967c40a99f77b4c82", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 5047, "upload_time": "2019-09-07T16:25:08", "url": "https://files.pythonhosted.org/packages/a8/a6/d1996e970249036e0e8488e4326188037e48fdbae22aa39636b057e23479/dataframe-column-identifier-0.0.3.tar.gz" } ], "0.0.4": [ { "comment_text": "", "digests": { "md5": "7077dc337620ad502029218ae6c333fb", "sha256": "fa15c5aac731d371e5a884ab8bcfea0bd4ba3c2f7ddc0195f4b69c676e6f0fb2" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.4-py3-none-any.whl", "has_sig": false, "md5_digest": "7077dc337620ad502029218ae6c333fb", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 5514, "upload_time": "2019-09-10T04:25:15", "url": "https://files.pythonhosted.org/packages/3b/5d/7c73d6fbca84003da3bb571c71ef686c052e83f47b34751770ea59234012/dataframe_column_identifier-0.0.4-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "c5e815046442fe59260340baa708e7ac", "sha256": "1a5b77cbc5e1b97893f4a526efcb8f356983318c57069659be7095c5420f76d1" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.4.tar.gz", "has_sig": false, "md5_digest": "c5e815046442fe59260340baa708e7ac", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 4873, "upload_time": "2019-09-10T04:25:17", "url": "https://files.pythonhosted.org/packages/f2/6c/2b99d4d5ae0e7946ed669da954ff8bc8decdbafa5e63b1241e775ab76d1c/dataframe-column-identifier-0.0.4.tar.gz" } ], "0.0.5": [ { "comment_text": "", "digests": { "md5": "34faeffd21dc12f1116be7eb13f07063", "sha256": "8b4e3deaaa15e3528cf714ced613b38a51a51ebfab15aa9321e679d6ecc3495e" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.5-py3-none-any.whl", "has_sig": false, "md5_digest": "34faeffd21dc12f1116be7eb13f07063", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 8519, "upload_time": "2019-09-11T06:19:27", "url": "https://files.pythonhosted.org/packages/0e/1a/8380bf3ca87390f8693c741286bdd1c3418d414f6d9ad254bf193f501249/dataframe_column_identifier-0.0.5-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "49d9220d69e2c8598aaf70c5e673436d", "sha256": "86f782abcdef558b6d129cd56bd3f85a07d2a361c5ca3cd1b87466a8caf394ad" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.5.tar.gz", "has_sig": false, "md5_digest": "49d9220d69e2c8598aaf70c5e673436d", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 6091, "upload_time": "2019-09-11T06:19:28", "url": "https://files.pythonhosted.org/packages/58/93/2a65efab23870dded622c926771d32cf8b74444abd875632e8de2c6d9cc3/dataframe-column-identifier-0.0.5.tar.gz" } ] }, "urls": [ { "comment_text": "", "digests": { "md5": "34faeffd21dc12f1116be7eb13f07063", "sha256": "8b4e3deaaa15e3528cf714ced613b38a51a51ebfab15aa9321e679d6ecc3495e" }, "downloads": -1, "filename": "dataframe_column_identifier-0.0.5-py3-none-any.whl", "has_sig": false, "md5_digest": "34faeffd21dc12f1116be7eb13f07063", "packagetype": "bdist_wheel", "python_version": "py3", "requires_python": ">=3.6", "size": 8519, "upload_time": "2019-09-11T06:19:27", "url": "https://files.pythonhosted.org/packages/0e/1a/8380bf3ca87390f8693c741286bdd1c3418d414f6d9ad254bf193f501249/dataframe_column_identifier-0.0.5-py3-none-any.whl" }, { "comment_text": "", "digests": { "md5": "49d9220d69e2c8598aaf70c5e673436d", "sha256": "86f782abcdef558b6d129cd56bd3f85a07d2a361c5ca3cd1b87466a8caf394ad" }, "downloads": -1, "filename": "dataframe-column-identifier-0.0.5.tar.gz", "has_sig": false, "md5_digest": "49d9220d69e2c8598aaf70c5e673436d", "packagetype": "sdist", "python_version": "source", "requires_python": ">=3.6", "size": 6091, "upload_time": "2019-09-11T06:19:28", "url": "https://files.pythonhosted.org/packages/58/93/2a65efab23870dded622c926771d32cf8b74444abd875632e8de2c6d9cc3/dataframe-column-identifier-0.0.5.tar.gz" } ] }