{ "info": { "author": "Jerry", "author_email": "jerryzhujian9@gmail.com", "bugtrack_url": null, "classifiers": [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 2.7", "Topic :: Software Development :: Build Tools" ], "description": "This module is for easy interaction with linux, Mac OS X, Windows shell.\n=============================================\njerryzhujian9_at_gmail.com\nTested under python 2.7\nTo see your python version\nin terminal: python -V\nor in python: import sys; print (sys.version)\n=============================================\nInstall:\nhttps://pypi.python.org/pypi/ez\npip install ez\n\nAlmost all commands support the usage of '~', '..', '.', '?', '*' in path (ls,fls only support regular expression).\nSymbolic link itself is the target of file operations; the actual file should be safe.\n\ndebug(1/0)\n # 0 = everything will be actually executed\n # 1 = simulate operations of cp, mv, execute; other commands will be actually performed.\n will print out simulated commands, useful for debugging and for counting files when necessary.\nerror(msg)\n\nfullpath(path)\npwd() or cwd() # Returns current working directory.\ncsd(), csf() # Returns current script directory, i.e. the directory where the running script is.\nparentdir(path) # Returns the parent directory of a path.\njoinpath(path1[, path2[, ...]]) # Returns the joined path. Supports vectorization.\nsplitpath(path) # Returns a list of path elements: [path, file, ext]. Supports vectorization.\ncd(path) # Changes to a new working directory.\n\njoin(sep,string1,string2), join(sep,array) # Glues together strings with sep. 
Supports vectorization.\nsort(array)\nreplace(theList,theItem,replacement), remove(theList,theItem)\n\nls([path[, regex]], full=True) # Returns a list of all (including hidden) files with their full paths in path, filtered by regular expression.\nlsd([path[, regex]], full=True)\nfls([path[, regex]]) # Returns a list of files with their full paths in flattened path (i.e. walk each subdirectory).\n# the filter only works for short file name not for full file name, i.e. the file name itself not its full path\n# regular expression is case-sensitive\n# usage: ls(); ls(cwd()); ls(cwd(), \"\\.py$\")\n\nmkdir(\"path/to/a/directory\") # Makes a directory (also any one of the \"path\", \"to\", \"a\" directories if they do not exist).\nrn(old, new) # Renames old to new.\nexists(path) # Returns the existence of path (0 or 1).\nrm(path) # Deletes a file or folder. Supports wildcards, vectorization.\ncp(source, destination) # Copies source file(s) or folder to destination. Supports wildcards, vectorization.\nmv(source, destination) # Moves source file(s) or folder to destination. Supports wildcards, vectorization.\n\nexecute(cmd, output=True) # Executes a bash command with or without capturing shell output\nwith nooutput():\n print 'this is will not be printed in stdout'\npprint() # Pretty prints.\nbeep() # Beeps to notify user.\nwhich(name) # Prints where a module is and in which module a function is. which('python') returns which python is being used.\nhelp(name)/doc(name) # name is a string, Prints the doc string of a module/class/function\n when write a module, add:\n __doc__ = three double quotes blabla three double quotes <-----this is module's docstring, use explicit\n\n when write a function/class:\n def function(arg):\n three double quotes Returns, blabla three double quotes <-----this is function's docstring, use implicit\n return sth\nver(package_name) version(package_name), see a package's version. 
package_name could be 'python'\nwhos(name),whos() list imported functions/packages\n\nlog(file=\"log.txt\", mode='a', status=True)\n status=True (default) Prints output to both terminal and a file (log.txt, default name) globally.\n status=False Prints output only to terminal\n mode: a=append; w=overwrite\n Note: use this function carefully, because it changes the sys.stdout globally.\n\ntree([path[, forest=True]]) # Prints a directory tree structure. \n forest=True (default) prints only folders, i.e., print less to show the big forest\n forest=False prints files plus folders\n\n[starts, ends] = regexp(string, pattern); regexp(string, pattern, method='split/match'), regexpi\nregexprep(string, pattern, replace, count=0), regexprepi\n\nsprintf(formatString, *args)\niff(expression, result1, result2)\nclear(module, recursive=False)\n\nnum(string)\nisempty(s)\nRandomize(x), randomize(x) # Sets a randomization seed.\nRandomizeArray(list=[]) randomizearray(list=[]) # Shuffles a list in place.\nRandom(a,b) random(a,b) # Returns a random integer N such that a <= N <= b.\nRandomChoice(seq), randomchoice(seq) # Returns a random element from sequence\nPermute(iterable=[]) permute(iterable=[]) # Returns permutations in a list\n\nunique(seq), union(seq1,seq2), intersect(seq1,seq2), setdiff(seq1,seq2) in original order\n note: setdiff(seq1,seq2) may not be equal to setdiff(seq2,seq1)\n >>> unique('abracadaba')\n ['a', 'b', 'r', 'c', 'd']\n >>> unique('simsalabim')\n ['s', 'i', 'm', 'a', 'l', 'b']\n >>>\n >>> setdiff('abracadaba','simsalabim')\n ['r', 'c', 'd']\n >>> setdiff('simsalabim','abracadaba')\n ['s', 'i', 'm', 'l']\nduplicate(seq) # returns a list of duplicated elements in original order\n\nJDict() # Jerry's dictionary, customized ordered dictionary class with convenient attributes and methods, see help(JDict)\nMoment(timezone) # Generates the current datetime in specified timezone, or local naive datetime if omitted.\n\nSetClip(content), setclip(content) # Copy/Write 
something to current clipboard\ncontent = GetClip(), content = getclip() # Read out content from current clipboard and assign to a variable\n\nlines(path='.', pattern='\\.py$|.ini$|\\.c$|\\.h$|\\.m$', recursive=True) # Counts lines of code, counting empty lines as well.\nkeygen(length=8, complexity=3) # generate a random key\nhashes(filename): # Calculate/Print a file's md5 32; sha1 32; can handle big files in a memory efficient way\n\nisemailvalid(email) # True or False, isEmailValid, IsEmailValid\nexport(input,output,options,**kwargs): # Convert url, file (html, txt), string to a single pdf\n\n\n\n\n\nTo avoid typing email password each time, place a file named pygmailconfig.py with\nEMAIL = 'someone@gmail.com'\nPASSWORD = 'abcdefghik'\nin the site-packages/ez folder\nThe functions will no longer need email/password and become like this\nMail(to, subject, body, attach=None), AddEvent(event), Sheet(fileName)\n\nMail([EMAIL, PASSWORD, ] to, subject, body, attachment=None, bcc=None, cc=None, reply_to=None)\n to/bcc/cc: ['a@a.com','b@b.com'] or 'a@a.com, b@b.com'\n reply_to: 'a@a.com'\n attachment: 'file_in_working_dir.txt' or ['a.txt','b.py','c.pdf']\nAddEvent([EMAIL, PASSWORD, ] event) on DATE at TIME for DURATION in PLACE\n\nSheet([EMAIL, PASSWORD, ] fileName)\n returns a sheet object representing \"Sheet 1\"\n\n your google account doesn't have to be the owner of this sheet, as long as you can edit it.\n but you need to initialize/create this sheet and maybe the header by hand to begin with\n the header could have spaces, ? etc, and when they are used as the keywords of dictionary, they are all converted to lowercase and all illegal characters are removed e.g. Delayed Test_date? 
--> delayedtestdate\n\n fileName should be unique, can have spaces\n\n\nGetRows(query=None, order_by=None,\n reverse=None, filter_func=None)\n :param query:\n A string structured query on the full text in the worksheet.\n [columnName][binaryOperator][value]\n Supported binaryOperators are:\n - (), for overriding order of operations\n - = or ==, for strict equality\n - <> or !=, for strict inequality\n - and or &&, for boolean and\n - or or ||, for boolean or.\n :param order_by:\n A string which specifies what column to use in ordering the\n entries in the feed. By position (the default): 'position' returns\n rows in the order in which they appear in the GUI. Row 1, then\n row 2, then row 3, and so on. By column:\n 'column:columnName' sorts rows in ascending order based on the\n values in the column with the given columnName, where\n columnName is the value in the header row for that column.\n :param reverse:\n A string which specifies whether to sort in descending or ascending\n order.Reverses default sort order: 'true' results in a descending\n sort; 'false' (the default) results in an ascending sort.\n :param filter_func:\n A lambda function which applied to each row, Gets a row dict as\n argument and returns True or False. Used for filtering rows in\n memory (as opposed to query which filters on the service side).\n :return:\n A list of row dictionaries.\n\n\nUpdateRow(row_data):\n Update Row (By ID).\n\n Only the fields supplied will be updated.\n :param row_data:\n A dictionary containing row data. 
The row will be updated according\n to the value in the ID_FIELD.\n :return:\n The updated row.\n\n\nUpdateRowByIndex(index, row_data):\n Update Row By Index\n\n :param index:\n An integer designating the index of a row to update (zero based).\n Index is relative to the returned result set, not to the original\n spreadsheet.\n :param row_data:\n A dictionary containing row data.\n :return:\n The updated row.\n\n\nInsertRow(row_data):\n Append Row at the end\n\n :param row_data:\n A dictionary containing row data.\n :return:\n A row dictionary for the inserted row.\n\n\nDeleteRow(row):\n Delete Row (By ID).\n\n Requires that the given row dictionary contains an ID_FIELD.\n :param row:\n A row dictionary to delete.\n\n\nDeleteRowByIndex(index):\n Delete Row By Index\n\n :param index:\n A row index. Index is relative to the returned result set, not to\n the original spreadsheet.\n\n\nDeleteAllRows():\n Delete All Rows\n\n\n\n\n\nAttributes:\n name\n url\n html # html code\nMethods:\n __init__(source, render=False, name=None)\n # source could be url or string code\n # render requires wx/webkit to parse html\n # internally update the scraper object's attributes (e.g. url, html)\n xpath(xpath, first=False) # first=False returns all matched as a list; first=True, first matched as string\n\nExamples:\n / = root, // = all, [] = constraint, @ = attributes\n\n s = Scraper('
abcLINK 1
LINK 2def
abc
ghi
LINK 3jkl
')\n \n print s.xpath('/div/a')\n # ['LINK 1', 'LINK 3']\n\n print s.xpath('/div/a[@class=\"link\"]')\n # ['LINK 1']\n\n print s.xpath('/div[1]//a')\n # ['LINK 1', 'LINK 2']\n\n print s.xpath('/div/a/@class')\n # ['link', '']\n\n print s.xpath('/div[-1]/a')\n # ['LINK 3']\n\n s = Scraper(u'google')\n print s.xpath('//a[@class=\"flink\"]', 1)\n # 'google'\n\n # test finding just the first instance for a large amount of content\n s = Scraper('
content
' * 10000)\n print s.xpath('//span', 1)\n # 'content'\n\n # test extracting attribute of self closing tag\n s = Scraper('
')\n print s.xpath('/div/img/@src', 1)\n # 'img.png'\n\n # test extracting attribute after self closing tag\n s = Scraper('

content

')\n print s.xpath('/div/p')\n # 'content'\n\nSample:\n import time\n COL_NAME = \"Words_And_Idioms\"\n\n output = open(COL_NAME+\".txt\", 'w')\n\n for i in range(1,2):\n first = Scraper(\"http://www.51voa.com/\"+COL_NAME+\"_\"+str(i)+\".html\")\n time.sleep(1)\n lists = first.xpath(\"//li\")\n for item in lists:\n if \"/Voa_English_Learning/\" in item:\n temp = Scraper(item)\n time.sleep(1)\n link = \"http://www.51voa.com\"+temp.xpath(\"/@href\",1)\n second = Scraper(link)\n time.sleep(1)\n try:\n download = re.search(\"/.*/.*mp3\", second.html).group(0)\n except:\n download = \"missing\"\n print >> output, \"http://stream.51voa.com\"+download\n output.flush()", "description_content_type": null, "docs_url": null, "download_url": "UNKNOWN", "downloads": { "last_day": -1, "last_month": -1, "last_week": -1 }, "home_page": "https://pypi.python.org/pypi/webly", "keywords": "web,cross-platform,scrape,rss,json,xpath,wrapper", "license": "MIT", "maintainer": null, "maintainer_email": null, "name": "webly", "package_url": "https://pypi.org/project/webly/", "platform": "UNKNOWN", "project_url": "https://pypi.org/project/webly/", "project_urls": { "Download": "UNKNOWN", "Homepage": "https://pypi.python.org/pypi/webly" }, "release_url": "https://pypi.org/project/webly/0.0.1/", "requires_dist": null, "requires_python": null, "summary": "web related library", "version": "0.0.1" }, "last_serial": 1609520, "releases": { "0.0.1": [] }, "urls": [] }