# ===== site/conf.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2015-11-10 01:31:54
# Modified on 2016-10-26 20:46:20

import sys
from unittest.mock import MagicMock

from recommonmark.parser import CommonMarkParser


class Mock(MagicMock):
    # mock heavy C-extension imports so the docs build without them installed
    @classmethod
    def __getattr__(cls, name):
        return Mock()

MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

source_parsers = {
    '.md': CommonMarkParser,
}
source_suffix = ['.rst', '.md']

# ===== site/__init__.py =====
# (empty)

# ===== weblocust/logging.conf =====
[loggers]
keys=root,scheduler,fetcher,processor,webui,bench,werkzeug

[logger_root]
level=INFO
handlers=screen

[logger_scheduler]
level=INFO
handlers=screen
qualname=scheduler
propagate=0

[logger_fetcher]
level=DEBUG
handlers=screen
qualname=fetcher
propagate=0

[logger_processor]
level=DEBUG
handlers=screen
qualname=processor
propagate=0

[logger_webui]
level=DEBUG
handlers=screen
qualname=webui
propagate=0

[logger_bench]
level=DEBUG
handlers=screen
qualname=bench
propagate=0

[logger_werkzeug]
level=DEBUG
handlers=screen
qualname=werkzeug
propagate=0

[handlers]
keys=screen,txtfile

[handler_screen]
class=logging.StreamHandler
formatter=pretty
level=WARNING
args=(sys.stdout,)

[handler_txtfile]
class=logging.FileHandler
formatter=pretty
level=WARNING
args=("./spiderRuntime.log",)

[formatters]
keys=pretty

[formatter_pretty]
class=weblocust.libs.log.LogFormatter
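# ----- example (not part of the package) -----
# Note that every logger in the config above routes to the `screen` handler,
# and that handler filters at WARNING, so the DEBUG level on logger_fetcher
# and friends only takes effect with a more permissive handler. A minimal
# sketch of checking this, assuming weblocust is installed so fileConfig()
# can resolve the LogFormatter named by [formatter_pretty]; the path is
# illustrative:
#
import logging
import logging.config

logging.config.fileConfig('weblocust/logging.conf')
logging.getLogger('fetcher').debug('filtered out: handler_screen is WARNING')
logging.getLogger('fetcher').warning('this one reaches sys.stdout')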
{"webui": {"port":5001}}') @click.option('--logging-config', default=os.path.join(os.path.dirname(__file__), "logging.conf"), help="logging config file for built-in python logging module", show_default=True) @click.option('--debug', envvar='DEBUG', default=False, is_flag=True, help='debug mode') @click.option('--queue-maxsize', envvar='QUEUE_MAXSIZE', default=100, help='maxsize of queue') @click.option('--taskdb', envvar='TASKDB', callback=connect_db, help='database url for taskdb, default: sqlite') @click.option('--projectdb', envvar='PROJECTDB', callback=connect_db, help='database url for projectdb, default: sqlite') @click.option('--resultdb', envvar='RESULTDB', callback=connect_db, help='database url for resultdb, default: sqlite') @click.option('--message-queue', envvar='AMQP_URL', help='connection url to message queue, ' 'default: builtin multiprocessing.Queue') @click.option('--amqp-url', help='[deprecated] amqp url for rabbitmq. ' 'please use --message-queue instead.') @click.option('--beanstalk', envvar='BEANSTALK_HOST', help='[deprecated] beanstalk config for beanstalk queue. ' 'please use --message-queue instead.') @click.option('--phantomjs-proxy', envvar='PHANTOMJS_PROXY', help="phantomjs proxy ip:port") @click.option('--data-path', default='./data', help='data dir path') @click.option('--add-sys-path/--not-add-sys-path', default=True, is_flag=True, help='add current working directory to python lib search path') @click.version_option(version=__version__, prog_name=__program_name__) @click.pass_context def cli(ctx, **kwargs): """ A powerful spider system in python. """ if kwargs['add_sys_path']: sys.path.append(os.getcwd()) logging.config.fileConfig(kwargs['logging_config']) # get db from env for db in ('taskdb', 'projectdb', 'resultdb'): if kwargs[db] is not None: continue if os.environ.get('MYSQL_NAME'): kwargs[db] = utils.Get(lambda db=db: connect_database( 'sqlalchemy+mysql+%s://%s:%s/%s' % ( db, os.environ['MYSQL_PORT_3306_TCP_ADDR'], os.environ['MYSQL_PORT_3306_TCP_PORT'], db))) elif os.environ.get('MONGODB_NAME'): kwargs[db] = utils.Get(lambda db=db: connect_database( 'mongodb+%s://%s:%s/%s' % ( db, os.environ['MONGODB_PORT_27017_TCP_ADDR'], os.environ['MONGODB_PORT_27017_TCP_PORT'], db))) elif ctx.invoked_subcommand == 'bench': if kwargs['data_path'] == './data': kwargs['data_path'] += '/bench' shutil.rmtree(kwargs['data_path'], ignore_errors=True) os.mkdir(kwargs['data_path']) if db in ('taskdb', 'resultdb'): kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db))) else: kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % ( db, kwargs['data_path'], db[:-2]))) else: if not os.path.exists(kwargs['data_path']): os.mkdir(kwargs['data_path']) kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % ( db, kwargs['data_path'], db[:-2]))) kwargs['is_%s_default' % db] = True # create folder for counter.dump if not os.path.exists(kwargs['data_path']): os.mkdir(kwargs['data_path']) # message queue, compatible with old version if kwargs.get('message_queue'): pass elif kwargs.get('amqp_url'): kwargs['message_queue'] = kwargs['amqp_url'] elif os.environ.get('RABBITMQ_NAME'): kwargs['message_queue'] = ("amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ) elif kwargs.get('beanstalk'): kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk'] for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher', 'fetcher2processor', 'processor2result'): if 
kwargs.get('message_queue'): kwargs[name] = utils.Get(lambda name=name: connect_message_queue( name, kwargs.get('message_queue'), kwargs['queue_maxsize'])) else: kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'), kwargs['queue_maxsize']) # phantomjs-proxy if kwargs.get('phantomjs_proxy'): pass elif os.environ.get('PHANTOMJS_NAME'): kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):] ctx.obj = utils.ObjectDict(ctx.obj or {}) ctx.obj['instances'] = [] ctx.obj.update(kwargs) if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'): ctx.invoke(all) return ctx @cli.command() @click.option('--xmlrpc/--no-xmlrpc', default=True) @click.option('--xmlrpc-host', default='0.0.0.0') @click.option('--xmlrpc-port', envvar='SCHEDULER_XMLRPC_PORT', default=23333) @click.option('--inqueue-limit', default=0, help='size limit of task queue for each project, ' 'tasks will been ignored when overflow') @click.option('--delete-time', default=3* 60, help='delete time before marked as delete') @click.option('--active-tasks', default=100, help='active log size') @click.option('--loop-limit', default=1000, help='maximum number of tasks due with in a loop') @click.option('--scheduler-cls', default='weblocust.scheduler.ThreadBaseScheduler', callback=load_cls, help='scheduler class to be used.') @click.option('--threads', default=None, help='thread number for ThreadBaseScheduler, default: 4') @click.pass_context def scheduler(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, inqueue_limit, delete_time, active_tasks, loop_limit, scheduler_cls, threads): """ Run Scheduler, only one scheduler is allowed. """ g = ctx.obj Scheduler = load_cls(None, None, scheduler_cls) kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb, newtask_queue=g.newtask_queue, status_queue=g.status_queue, out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data')) if threads: kwargs['threads'] = int(threads) scheduler = Scheduler(**kwargs) scheduler.INQUEUE_LIMIT = inqueue_limit scheduler.DELETE_TIME = delete_time scheduler.ACTIVE_TASKS = active_tasks scheduler.LOOP_LIMIT = loop_limit g.instances.append(scheduler) if g.get('testing_mode'): return scheduler if xmlrpc: utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host) scheduler.run() @cli.command() @click.option('--xmlrpc/--no-xmlrpc', default=False) @click.option('--xmlrpc-host', default='0.0.0.0') @click.option('--xmlrpc-port', envvar='FETCHER_XMLRPC_PORT', default=24444) @click.option('--poolsize', default=100, help="max simultaneous fetches") @click.option('--proxy', help="proxy host:port") @click.option('--user-agent', help='user agent') @click.option('--timeout', help='default fetch timeout') @click.option('--fetcher-cls', default='weblocust.fetcher.Fetcher', callback=load_cls, help='Fetcher class to be used.') @click.pass_context def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, fetcher_cls, async=True): """ Run Fetcher. 
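# ----- example (not part of the package) -----
# A minimal sketch of a config file the -c/--config option above accepts.
# read_config() rewrites dashes in keys to underscores, so "delete-time" and
# "delete_time" are equivalent; the values and filename are illustrative.
#
import json

config = {
    "taskdb": "sqlite+taskdb:///data/task.db",
    "webui": {"port": 5001},            # the example from the --config help text
    "scheduler": {"delete-time": 60},   # normalized to delete_time by read_config()
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
# then: weblocust -c config.json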
""" g = ctx.obj Fetcher = load_cls(None, None, fetcher_cls) fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor, poolsize=poolsize, proxy=proxy, async=async) fetcher.phantomjs_proxy = g.phantomjs_proxy if user_agent: fetcher.user_agent = user_agent if timeout: fetcher.default_options = copy.deepcopy(fetcher.default_options) fetcher.default_options['timeout'] = timeout g.instances.append(fetcher) if g.get('testing_mode'): return fetcher if xmlrpc: utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host) fetcher.run() @cli.command() @click.option('--processor-cls', default='weblocust.processor.Processor', callback=load_cls, help='Processor class to be used.') @click.pass_context def processor(ctx, processor_cls, enable_stdout_capture=True): """ Run Processor. """ g = ctx.obj Processor = load_cls(None, None, processor_cls) processor = Processor(projectdb=g.projectdb, inqueue=g.fetcher2processor, status_queue=g.status_queue, newtask_queue=g.newtask_queue, result_queue=g.processor2result, enable_stdout_capture=enable_stdout_capture) g.instances.append(processor) if g.get('testing_mode'): return processor processor.run() @cli.command() @click.option('--result-cls', default='weblocust.result.ResultWorker', callback=load_cls, help='ResultWorker class to be used.') @click.pass_context def result_worker(ctx, result_cls): """ Run result worker. """ g = ctx.obj ResultWorker = load_cls(None, None, result_cls) result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result) g.instances.append(result_worker) if g.get('testing_mode'): return result_worker result_worker.run() @cli.command() @click.option('--host', default='0.0.0.0', envvar='WEBUI_HOST', help='webui bind to host') @click.option('--port', default=5000, envvar='WEBUI_PORT', help='webui bind to host') @click.option('--cdn', default='//cdnjscn.b0.upaiyun.com/libs/', help='js/css cdn server') @click.option('--scheduler-rpc', help='xmlrpc path of scheduler') @click.option('--fetcher-rpc', help='xmlrpc path of fetcher') @click.option('--max-rate', type=float, help='max rate for each project',default=8.0) @click.option('--max-burst', type=float, help='max burst for each project',default=5.0) @click.option('--username', envvar='WEBUI_USERNAME', help='username of lock -ed projects') @click.option('--password', envvar='WEBUI_PASSWORD', help='password of lock -ed projects') @click.option('--need-auth', is_flag=True, default=False, help='need username and password') @click.option('--webui-instance', default='weblocust.webui.app.app', callback=load_cls, help='webui Flask Application instance to be used.') @click.pass_context def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst, username, password, need_auth, webui_instance): """ Run WebUI """ app = load_cls(None, None, webui_instance) g = ctx.obj app.config['taskdb'] = g.taskdb app.config['projectdb'] = g.projectdb app.config['resultdb'] = g.resultdb app.config['cdn'] = cdn if max_rate: app.config['max_rate'] = max_rate if max_burst: app.config['max_burst'] = max_burst if username: app.config['webui_username'] = username if password: app.config['webui_password'] = password app.config['need_auth'] = need_auth # inject queues for webui for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher', 'fetcher2processor', 'processor2result'): app.config['queues'][name] = getattr(g, name, None) # fetcher rpc if isinstance(fetcher_rpc, six.string_types): import umsgpack fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc) 
app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data) else: # get fetcher instance for webui fetcher_config = g.config.get('fetcher', {}) scheduler2fetcher = g.scheduler2fetcher fetcher2processor = g.fetcher2processor testing_mode = g.get('testing_mode', False) g['scheduler2fetcher'] = None g['fetcher2processor'] = None g['testing_mode'] = True webui_fetcher = ctx.invoke(fetcher, async=False, **fetcher_config) g['scheduler2fetcher'] = scheduler2fetcher g['fetcher2processor'] = fetcher2processor g['testing_mode'] = testing_mode app.config['fetch'] = lambda x: webui_fetcher.fetch(x) if isinstance(scheduler_rpc, six.string_types): scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc) if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'): app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % ( os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):])) elif scheduler_rpc is None: app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://127.0.0.1:23333/') else: app.config['scheduler_rpc'] = scheduler_rpc app.debug = g.debug g.instances.append(app) if g.get('testing_mode'): return app app.run(host=host, port=port) @cli.command() @click.option('--phantomjs-path', default='phantomjs', help='phantomjs path') @click.option('--port', default=25555, help='phantomjs port') @click.option('--auto-restart', default=False, help='auto restart phantomjs if crashed') @click.argument('args', nargs=-1) @click.pass_context def phantomjs(ctx, phantomjs_path, port, auto_restart, args): """ Run phantomjs fetcher if phantomjs is installed. """ args = args or ctx.default_map and ctx.default_map.get('args', []) import subprocess g = ctx.obj _quit = [] phantomjs_fetcher = os.path.join( os.path.dirname(weblocust.__file__), 'fetcher/phantomjs_fetcher.js') cmd = [phantomjs_path, # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903 #'--load-images=false', '--ssl-protocol=any', '--disk-cache=true'] + list(args or []) + [phantomjs_fetcher, str(port)] try: _phantomjs = subprocess.Popen(cmd) except OSError: logging.warning('phantomjs not found, continue running without it.') return None def quit(*args, **kwargs): _quit.append(1) _phantomjs.kill() _phantomjs.wait() logging.info('phantomjs existed.') if not g.get('phantomjs_proxy'): g['phantomjs_proxy'] = '127.0.0.1:%s' % port phantomjs = utils.ObjectDict(port=port, quit=quit) g.instances.append(phantomjs) if g.get('testing_mode'): return phantomjs while True: _phantomjs.wait() if _quit or not auto_restart: break _phantomjs = subprocess.Popen(cmd) @cli.command() @click.option('--fetcher-num', default=1, help='instance num of fetcher') @click.option('--processor-num', default=1, help='instance num of processor') @click.option('--result-worker-num', default=1, help='instance num of result worker') @click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']), help='run each components in thread or subprocess. 
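# ----- example (not part of the package) -----
# A sketch of talking to a running scheduler the same way connect_rpc() above
# builds its proxies; bench() below polls size() in just this way to detect an
# empty queue. The URL matches the default used by webui(); it is illustrative.
#
from six.moves import xmlrpc_client

scheduler_rpc = xmlrpc_client.ServerProxy('http://127.0.0.1:23333/', allow_none=True)
print(scheduler_rpc.size())   # number of tasks still queued in the scheduler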
@cli.command()
@click.option('--fetcher-num', default=1, help='instance num of fetcher')
@click.option('--processor-num', default=1, help='instance num of processor')
@click.option('--result-worker-num', default=1, help='instance num of result worker')
@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
              help='run each component in a thread or subprocess. always uses threads on windows.')
@click.pass_context
def all(ctx, fetcher_num, processor_num, result_worker_num, run_in):
    """Run all the components in subprocess or thread."""
    ctx.obj['debug'] = False
    g = ctx.obj

    # FIXME: py34 cannot run components with threads
    if run_in == 'subprocess' and os.name != 'nt':
        run_in = utils.run_in_subprocess
    else:
        run_in = utils.run_in_thread

    threads = []

    try:
        # phantomjs
        if not g.get('phantomjs_proxy'):
            phantomjs_config = g.config.get('phantomjs', {})
            phantomjs_config.setdefault('auto_restart', True)
            threads.append(run_in(ctx.invoke, phantomjs, **phantomjs_config))
            time.sleep(2)
            if threads[-1].is_alive() and not g.get('phantomjs_proxy'):
                g['phantomjs_proxy'] = '127.0.0.1:%s' % phantomjs_config.get('port', 25555)

        # result worker
        result_worker_config = g.config.get('result_worker', {})
        for i in range(result_worker_num):
            threads.append(run_in(ctx.invoke, result_worker, **result_worker_config))

        # processor
        processor_config = g.config.get('processor', {})
        for i in range(processor_num):
            threads.append(run_in(ctx.invoke, processor, **processor_config))

        # fetcher
        fetcher_config = g.config.get('fetcher', {})
        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
        for i in range(fetcher_num):
            threads.append(run_in(ctx.invoke, fetcher, **fetcher_config))

        # scheduler
        scheduler_config = g.config.get('scheduler', {})
        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')
        threads.append(run_in(ctx.invoke, scheduler, **scheduler_config))

        # run webui in the main thread to keep the process exitable
        webui_config = g.config.get('webui', {})
        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'
                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))
        ctx.invoke(webui, **webui_config)
    finally:
        # exit components run in threading
        for each in g.instances:
            each.quit()

        # exit components run in subprocess
        for each in threads:
            if not each.is_alive():
                continue
            if hasattr(each, 'terminate'):
                each.terminate()
            each.join()


@cli.command()
@click.option('--fetcher-num', default=1, help='instance num of fetcher')
@click.option('--processor-num', default=2, help='instance num of processor')
@click.option('--result-worker-num', default=1, help='instance num of result worker')
@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
              help='run each component in a thread or subprocess. always uses threads on windows.')
@click.option('--total', default=10000, help="total urls in test page")
@click.option('--show', default=20, help="show how many urls in a page")
@click.option('--taskdb-bench', default=False, is_flag=True,
              help="only run taskdb bench test")
@click.option('--message-queue-bench', default=False, is_flag=True,
              help="only run message queue bench test")
@click.option('--all-bench', default=False, is_flag=True, help="only run all bench test")
@click.pass_context
def bench(ctx, fetcher_num, processor_num, result_worker_num, run_in,
          total, show, taskdb_bench, message_queue_bench, all_bench):
    """
    Run Benchmark test.
    In bench mode, an in-memory sqlite database is used instead of an on-disk one.
    """
    from weblocust.libs import bench
    from weblocust.webui import bench_test  # flake8: noqa

    ctx.obj['debug'] = False
    g = ctx.obj
    if result_worker_num == 0:
        g['processor2result'] = None

    if run_in == 'subprocess' and os.name != 'nt':
        run_in = utils.run_in_subprocess
    else:
        run_in = utils.run_in_thread

    all_test = not taskdb_bench and not message_queue_bench and not all_bench

    # test taskdb
    if all_test or taskdb_bench:
        bench.bench_test_taskdb(g.taskdb)
    # test message queue
    if all_test or message_queue_bench:
        bench.bench_test_message_queue(g.scheduler2fetcher)
    # test all
    if not all_test and not all_bench:
        return

    project_name = '__bench_test__'

    def clear_project():
        g.taskdb.drop(project_name)
        g.projectdb.drop(project_name)
        g.resultdb.drop(project_name)

    clear_project()
    g.projectdb.insert(project_name, {
        'name': project_name,
        'status': 'RUNNING',
        'script': bench.bench_script % {'total': total, 'show': show},
        'rate': total,
        'burst': total,
        'updatetime': time.time()
    })

    # disable log
    logging.getLogger().setLevel(logging.ERROR)
    logging.getLogger('scheduler').setLevel(logging.ERROR)
    logging.getLogger('fetcher').setLevel(logging.ERROR)
    logging.getLogger('processor').setLevel(logging.ERROR)
    logging.getLogger('result').setLevel(logging.ERROR)
    logging.getLogger('webui').setLevel(logging.ERROR)
    logging.getLogger('werkzeug').setLevel(logging.ERROR)

    try:
        threads = []

        # result worker
        result_worker_config = g.config.get('result_worker', {})
        for i in range(result_worker_num):
            threads.append(run_in(ctx.invoke, result_worker,
                                  result_cls='weblocust.libs.bench.BenchResultWorker',
                                  **result_worker_config))

        # processor
        processor_config = g.config.get('processor', {})
        for i in range(processor_num):
            threads.append(run_in(ctx.invoke, processor,
                                  processor_cls='weblocust.libs.bench.BenchProcessor',
                                  **processor_config))

        # fetcher
        fetcher_config = g.config.get('fetcher', {})
        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
        for i in range(fetcher_num):
            threads.append(run_in(ctx.invoke, fetcher,
                                  fetcher_cls='weblocust.libs.bench.BenchFetcher',
                                  **fetcher_config))

        # scheduler
        scheduler_config = g.config.get('scheduler', {})
        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')
        scheduler_config.setdefault('xmlrpc_port', 23333)
        threads.append(run_in(ctx.invoke, scheduler,
                              scheduler_cls='weblocust.libs.bench.BenchScheduler',
                              **scheduler_config))
        scheduler_rpc = connect_rpc(ctx, None,
                                    'http://%(xmlrpc_host)s:%(xmlrpc_port)s/' % scheduler_config)

        # webui
        webui_config = g.config.get('webui', {})
        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'
                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))
        threads.append(run_in(ctx.invoke, webui, **webui_config))

        # wait for the bench test to finish
        while True:
            time.sleep(1)
            if scheduler_rpc.size() == 0:
                break
    finally:
        # exit components run in threading
        for each in g.instances:
            each.quit()

        # exit components run in subprocess
        for each in threads:
            if hasattr(each, 'terminate'):
                each.terminate()
            each.join(1)

        clear_project()
@cli.command()
@click.option('-i', '--interactive', default=False, is_flag=True,
              help='enable interactive mode, you can choose crawl url.')
@click.option('--phantomjs', 'enable_phantomjs', default=False, is_flag=True,
              help='enable phantomjs, will spawn a subprocess for phantomjs')
@click.argument('scripts', nargs=-1)
@click.pass_context
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one; it runs everything in one process
    over tornado.ioloop, for debugging purposes.
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from weblocust.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls', 'weblocust.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls', 'weblocust.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()


@cli.command()
@click.option('--scheduler-rpc', callback=connect_rpc, help='xmlrpc path of scheduler')
@click.argument('project', nargs=1)
@click.argument('message', nargs=1)
@click.pass_context
def send_message(ctx, scheduler_rpc, project, message):
    """Send Message to project from command line."""
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % (
            os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    return scheduler_rpc.send_task({
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    })


@cli.command()
@click.option("--filename", prompt="configure file name", help="configure file name",
              default="configure")
@click.option("--mongo-host", prompt="mongodb hostname or ip address", help="mongo host or ip",
              default="localhost")
@click.option("--redis-host", prompt="redis hostname or ip address", help="redis host or ip",
              default="localhost")
@click.pass_context
def mkconfig(context, filename, mongo_host, redis_host):
    """Generate a simple configuration file."""
    script = """
    {
      "taskdb": "mongodb+taskdb://__MONGO_HOST__:27017/weblocust_task",
      "projectdb": "mongodb+projectdb://__MONGO_HOST__:27017/weblocust_project",
      "resultdb": "mongodb+resultdb://__MONGO_HOST__:27017/weblocust_result",
      "message_queue": "redis://__REDIS_HOST__:6379/0",
      "result_worker": {
        "result_cls": "weblocust.result.AdvanceResultWorker"
      },
      "fetcher": {
        "timeout": 8
      },
      "webui": {
        "username": "admin",
        "password": "admin",
        "need-auth": false
      },
      "scheduler": {
        "delete_time": 60
      }
    }
    """
    configure_file_name = filename
    new_script = script.replace("__MONGO_HOST__", mongo_host).replace("__REDIS_HOST__", redis_host)
    with open(configure_file_name, 'w') as f:
        f.write(new_script)
    click.echo("\nconfigure file is generated as %s\nrun command `weblocust -c %s`"
               % (filename, filename))


@cli.command()
@click.pass_context
def phantomsource(context):
    """
    Get the phantomjs source code.
    If you want to run phantomjs separately, you can get the related js code.
    """
    phantomjs_code = os.path.join(os.path.dirname(__file__), "fetcher", "phantomjs_fetcher.js")
    with open(phantomjs_code, 'r') as source_code:
        with open("phantomjs_proxy.js", 'w') as targetfile:
            targetfile.write(source_code.read())
    click.echo("phantomjs_proxy source code is generated; "
               "open 'phantomjs_proxy.js' to see how to use it")


def main():
    cli()

if __name__ == '__main__':
    main()

# ===== weblocust/__init__.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-11-17 19:17:12
# Modified on 2016-10-26 20:46:20
# Based on pyspider 0.38

__version__ = '1.0.2'

# ===== weblocust/result/result_worker.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-10-19 15:37:46
# Modified on 2016-10-26 20:46:20

import time
import json
import logging

from six.moves import queue as Queue

logger = logging.getLogger("result")


class ResultWorker(object):
    """
    Do something with the result. Override this if needed.
    """

    def __init__(self, resultdb, inqueue):
        self.resultdb = resultdb
        self.inqueue = inqueue
        self._quit = False

    def on_result(self, task, result):
        '''Called for every result'''
        if not result:
            return
        if 'taskid' in task and 'project' in task and 'url' in task:
            logger.info('result %s:%s %s -> %.30r' % (
                task['project'], task['taskid'], task['url'], result))
            taskid = "{taskid}-{__multi__}".format(taskid=task['taskid'],
                                                   __multi__=result["__multi__"]) \
                if "__multi__" in result else task["taskid"]
            return self.resultdb.save(
                project=task['project'],
                taskid=taskid,
                url=task['url'],
                result=result
            )
        else:
            logger.warning('result UNKNOWN -> %.30r' % result)
            return

    def quit(self):
        self._quit = True

    def run(self):
        '''Run loop'''
        logger.info("result_worker starting...")

        while not self._quit:
            try:
                task, result = self.inqueue.get(timeout=1)
                self.on_result(task, result)
            except Queue.Empty as e:
                continue
            except KeyboardInterrupt:
                break
            except AssertionError as e:
                logger.error(e)
                continue
            except Exception as e:
                logger.exception(e)
                continue

        logger.info("result_worker exiting...")


class OneResultWorker(ResultWorker):
    '''Result Worker for one mode, write results to stdout'''

    def on_result(self, task, result):
        '''Called for every result'''
        if not result:
            return
        if 'taskid' in task and 'project' in task and 'url' in task:
            logger.info('result %s:%s %s -> %.30r' % (
                task['project'], task['taskid'], task['url'], result))
            print(json.dumps({
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task['url'],
                'result': result,
                'updatetime': time.time()
            }))
        else:
            logger.warning('result UNKNOWN -> %.30r' % result)
            return


class AdvanceResultWorker(ResultWorker):

    def on_result(self, task, result):
        '''Called for every result'''
        if not result:
            return
        if 'taskid' in task and 'project' in task and 'url' in task:
            logger.info('result %s:%s %s -> %.30r' % (
                task['project'], task['taskid'], task['url'], result))
            taskid = task["taskid"]
            # use the advanced save
            return self.resultdb.asave(
                project=task['project'],
                taskid=taskid,
                url=task['url'],
                result=result
            )
        else:
            logger.warning('result UNKNOWN -> %.30r' % result)
            return

# ===== weblocust/result/__init__.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-10-19 16:10:19
# Modified on 2016-10-26 20:46:20

from .result_worker import ResultWorker, OneResultWorker, AdvanceResultWorker
taskid = task["taskid"] # use advance save return self.resultdb.asave( project=task['project'], taskid=taskid, url=task['url'], result=result ) else: logger.warning('result UNKNOW -> %.30r' % result) return PKd6[Iyyweblocust/result/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-19 16:10:19 # Modified on 2016-10-26 20:46:20 from .result_worker import ResultWorker, OneResultWorker,AdvanceResultWorker PKd6[IP^weblocust/database/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-08 15:04:08 # Modified on 2016-10-26 20:46:20 from six.moves.urllib.parse import urlparse, parse_qs def connect_database(url): """ create database object by url mysql: mysql+type://user:passwd@host:port/database sqlite: # relative path sqlite+type:///path/to/database.db # absolute path sqlite+type:////path/to/database.db # memory database sqlite+type:// mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html redis: redis+taskdb://host:port/db elasticsearch: elasticsearch+type://host:port/?index=weblocust local: local+projectdb://filepath,filepath type: taskdb projectdb resultdb """ db = _connect_database(url) db.copy = lambda: _connect_database(url) return db def _connect_database(url): # NOQA parsed = urlparse(url) scheme = parsed.scheme.split('+') if len(scheme) == 1: raise Exception('wrong scheme format: %s' % parsed.scheme) else: engine, dbtype = scheme[0], scheme[-1] other_scheme = "+".join(scheme[1:-1]) if dbtype not in ('taskdb', 'projectdb', 'resultdb'): raise LookupError('unknown database type: %s, ' 'type should be one of ["taskdb", "projectdb", "resultdb"]', dbtype) if engine == 'mysql': parames = {} if parsed.username: parames['user'] = parsed.username if parsed.password: parames['passwd'] = parsed.password if parsed.hostname: parames['host'] = parsed.hostname if parsed.port: parames['port'] = parsed.port if parsed.path.strip('/'): parames['database'] = parsed.path.strip('/') if dbtype == 'taskdb': from .mysql.taskdb import TaskDB return TaskDB(**parames) elif dbtype == 'projectdb': from .mysql.projectdb import ProjectDB return ProjectDB(**parames) elif dbtype == 'resultdb': from .mysql.resultdb import ResultDB return ResultDB(**parames) else: raise LookupError elif engine == 'sqlite': if parsed.path.startswith('//'): path = '/' + parsed.path.strip('/') elif parsed.path.startswith('/'): path = './' + parsed.path.strip('/') elif not parsed.path: path = ':memory:' else: raise Exception('error path: %s' % parsed.path) if dbtype == 'taskdb': from .sqlite.taskdb import TaskDB return TaskDB(path) elif dbtype == 'projectdb': from .sqlite.projectdb import ProjectDB return ProjectDB(path) elif dbtype == 'resultdb': from .sqlite.resultdb import ResultDB return ResultDB(path) else: raise LookupError elif engine == 'mongodb': url = url.replace(parsed.scheme, 'mongodb') parames = {} if parsed.path.strip('/'): parames['database'] = parsed.path.strip('/') if dbtype == 
'taskdb': from .mongodb.taskdb import TaskDB return TaskDB(url, **parames) elif dbtype == 'projectdb': from .mongodb.projectdb import ProjectDB return ProjectDB(url, **parames) elif dbtype == 'resultdb': from .mongodb.resultdb import ResultDB return ResultDB(url, **parames) else: raise LookupError elif engine == 'sqlalchemy': if not other_scheme: raise Exception('wrong scheme format: %s' % parsed.scheme) url = url.replace(parsed.scheme, other_scheme) if dbtype == 'taskdb': from .sqlalchemy.taskdb import TaskDB return TaskDB(url) elif dbtype == 'projectdb': from .sqlalchemy.projectdb import ProjectDB return ProjectDB(url) elif dbtype == 'resultdb': from .sqlalchemy.resultdb import ResultDB return ResultDB(url) else: raise LookupError elif engine == 'redis': if dbtype == 'taskdb': from .redis.taskdb import TaskDB return TaskDB(parsed.hostname, parsed.port, int(parsed.path.strip('/') or 0)) else: raise LookupError('not supported dbtype: %s', dbtype) elif engine == 'local': scripts = url.split('//', 1)[1].split(',') if dbtype == 'projectdb': from .local.projectdb import ProjectDB return ProjectDB(scripts) else: raise LookupError('not supported dbtype: %s', dbtype) elif engine == 'elasticsearch' or engine == 'es': index = parse_qs(parsed.query) if 'index' in index and index['index']: index = index['index'][0] else: index = 'weblocust' if dbtype == 'projectdb': from .elasticsearch.projectdb import ProjectDB return ProjectDB([parsed.netloc], index=index) elif dbtype == 'resultdb': from .elasticsearch.resultdb import ResultDB return ResultDB([parsed.netloc], index=index) elif dbtype == 'taskdb': from .elasticsearch.taskdb import TaskDB return TaskDB([parsed.netloc], index=index) else: raise Exception('unknown engine: %s' % engine) PKd6[I[  weblocust/database/basedb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-08-30 17:43:49 # Modified on 2016-10-26 20:46:20 from __future__ import unicode_literals, division, absolute_import import logging logger = logging.getLogger('database.basedb') from six import itervalues class BaseDB: ''' BaseDB dbcur should be overwirte ''' __tablename__ = None placeholder = '%s' @staticmethod def escape(string): return '`%s`' % string @property def dbcur(self): raise NotImplementedError def _execute(self, sql_query, values=[]): dbcur = self.dbcur dbcur.execute(sql_query, values) return dbcur def _select(self, tablename=None, what="*", where="", where_values=[], offset=0, limit=None): tablename = self.escape(tablename or self.__tablename__) if isinstance(what, list) or isinstance(what, tuple) or what is None: what = ','.join(self.escape(f) for f in what) if what else '*' sql_query = "SELECT %s FROM %s" % (what, tablename) if where: sql_query += " WHERE %s" % where if limit: sql_query += " LIMIT %d, %d" % (offset, limit) logger.debug("", sql_query) for row in self._execute(sql_query, where_values): yield row def _select2dic(self, tablename=None, what="*", where="", where_values=[], order=None, offset=0, limit=None): tablename = self.escape(tablename or self.__tablename__) if isinstance(what, list) or isinstance(what, tuple) or what is None: what = ','.join(self.escape(f) for f in what) if what else '*' sql_query = "SELECT %s FROM %s" % (what, tablename) if where: sql_query += " WHERE %s" % where if order: sql_query += ' ORDER BY %s' % order if limit: sql_query += " LIMIT %d, %d" % (offset, limit) logger.debug("", 
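# ----- example (not part of the package) -----
# A few concrete URLs matching the docstring grammar of connect_database()
# above; hosts, ports and paths are placeholders.
#
from weblocust.database import connect_database

taskdb = connect_database('sqlite+taskdb:///data/task.db')       # relative path sqlite
projectdb = connect_database('mongodb+projectdb://localhost:27017/weblocust_project')
resultdb = connect_database('sqlalchemy+mysql+resultdb://root:passwd@localhost:3306/resultdb')
fresh = taskdb.copy()   # connect_database() attaches a copy() factory, see above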
# ===== weblocust/database/basedb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2012-08-30 17:43:49
# Modified on 2016-10-26 20:46:20

from __future__ import unicode_literals, division, absolute_import

import logging

from six import itervalues

logger = logging.getLogger('database.basedb')


class BaseDB:
    '''
    BaseDB

    dbcur should be overwritten
    '''
    __tablename__ = None
    placeholder = '%s'

    @staticmethod
    def escape(string):
        return '`%s`' % string

    @property
    def dbcur(self):
        raise NotImplementedError

    def _execute(self, sql_query, values=[]):
        dbcur = self.dbcur
        dbcur.execute(sql_query, values)
        return dbcur

    def _select(self, tablename=None, what="*", where="", where_values=[],
                offset=0, limit=None):
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        logger.debug(sql_query)

        for row in self._execute(sql_query, where_values):
            yield row

    def _select2dic(self, tablename=None, what="*", where="", where_values=[],
                    order=None, offset=0, limit=None):
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if order:
            sql_query += ' ORDER BY %s' % order
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        logger.debug(sql_query)

        dbcur = self._execute(sql_query, where_values)
        fields = [f[0] for f in dbcur.description]

        for row in dbcur:
            yield dict(zip(fields, row))

    def _replace(self, tablename=None, **values):
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join(self.escape(k) for k in values)
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "REPLACE INTO %s DEFAULT VALUES" % tablename
        logger.debug(sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _insert(self, tablename=None, **values):
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join((self.escape(k) for k in values))
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "INSERT INTO %s DEFAULT VALUES" % tablename
        logger.debug(sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _update(self, tablename=None, where="1=0", where_values=[], **values):
        tablename = self.escape(tablename or self.__tablename__)
        _key_values = ", ".join([
            "%s = %s" % (self.escape(k), self.placeholder) for k in values
        ])
        sql_query = "UPDATE %s SET %s WHERE %s" % (tablename, _key_values, where)
        logger.debug(sql_query)

        return self._execute(sql_query, list(itervalues(values)) + list(where_values))

    def _delete(self, tablename=None, where="1=0", where_values=[]):
        tablename = self.escape(tablename or self.__tablename__)
        sql_query = "DELETE FROM %s" % tablename
        if where:
            sql_query += " WHERE %s" % where
        logger.debug(sql_query)

        return self._execute(sql_query, where_values)


if __name__ == "__main__":
    import sqlite3

    class DB(BaseDB):
        __tablename__ = "test"

        def __init__(self):
            self.conn = sqlite3.connect(":memory:")
            cursor = self.conn.cursor()
            cursor.execute(
                '''CREATE TABLE `%s` (id INTEGER PRIMARY KEY AUTOINCREMENT, name, age)'''
                % self.__tablename__
            )

        @property
        def dbcur(self):
            return self.conn.cursor()

    # _select/_select2dic are generators, so consume them with next()/list()
    db = DB()
    assert db._insert(db.__tablename__, name="binux", age=23) == 1
    assert next(db._select(db.__tablename__, "name, age")) == ("binux", 23)
    assert list(db._select2dic(db.__tablename__, "name, age"))[0]["name"] == "binux"
    assert list(db._select2dic(db.__tablename__, "name, age"))[0]["age"] == 23
    db._replace(db.__tablename__, id=1, age=24)
    assert next(db._select(db.__tablename__, "name, age")) == (None, 24)
    db._update(db.__tablename__, "id = 1", age=16)
    assert next(db._select(db.__tablename__, "name, age")) == (None, 16)
    db._delete(db.__tablename__, "id = 1")
    assert list(db._select(db.__tablename__)) == []
# ===== weblocust/database/elasticsearch/projectdb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2016-01-17 18:32:33
# Modified on 2016-10-26 20:46:20

import time

import elasticsearch.helpers
from elasticsearch import Elasticsearch

from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB


class ProjectDB(BaseProjectDB):
    __type__ = 'project'

    def __init__(self, hosts, index='weblocust'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "updatetime": {"type": "double"}
                }
            })

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()

        obj.setdefault('group', '')
        obj.setdefault('status', 'TODO')
        obj.setdefault('script', '')
        obj.setdefault('comments', '')
        obj.setdefault('rate', 0)
        obj.setdefault('burst', 0)

        return self.es.index(index=self.index, doc_type=self.__type__, body=obj,
                             id=name, refresh=True)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__,
                              body={'doc': obj}, id=name, refresh=True, ignore=404)

    def get_all(self, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                 doc_type=self.__type__,
                                                 query={'query': {"match_all": {}}},
                                                 _source_include=fields or []):
            yield record['_source']

    def get(self, name, fields=None):
        ret = self.es.get(index=self.index, doc_type=self.__type__, id=name,
                          _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def check_update(self, timestamp, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                 doc_type=self.__type__,
                                                 query={'query': {"range": {
                                                     "updatetime": {"gte": timestamp}
                                                 }}},
                                                 _source_include=fields or []):
            yield record['_source']

    def drop(self, name):
        return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True)
# ===== weblocust/database/elasticsearch/resultdb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2016-01-18 19:41:24
# Modified on 2016-10-26 20:46:20

import time

import elasticsearch.helpers
from elasticsearch import Elasticsearch

from weblocust.database.base.resultdb import ResultDB as BaseResultDB


class ResultDB(BaseResultDB):
    __type__ = 'result'

    def __init__(self, hosts, index='weblocust'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": True},
                "properties": {
                    "taskid": {"enabled": False},
                    "project": {"type": "string", "index": "not_analyzed"},
                    "url": {"enabled": False},
                }
            })

    @property
    def projects(self):
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"aggs": {"projects": {
                                 "terms": {"field": "project"}
                             }}}, _source=False)
        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]

    def save(self, project, taskid, url, result):
        obj = {
            'taskid': taskid,
            'project': project,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.es.index(index=self.index, doc_type=self.__type__,
                             body=obj, id='%s:%s' % (project, taskid))

    def select(self, project, fields=None, offset=0, limit=0):
        if not limit:
            for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                     doc_type=self.__type__,
                                                     query={'query': {'term': {'project': project}}},
                                                     _source_include=fields or [], from_=offset,
                                                     sort="updatetime:desc"):
                yield record['_source']
        else:
            for record in self.es.search(index=self.index, doc_type=self.__type__,
                                         body={'query': {'term': {'project': project}}},
                                         _source_include=fields or [], from_=offset,
                                         size=limit, sort="updatetime:desc"
                                         ).get('hits', {}).get('hits', []):
                yield record['_source']

    def count(self, project):
        return self.es.count(index=self.index, doc_type=self.__type__,
                             body={'query': {'term': {'project': project}}}
                             ).get('count', 0)

    def get(self, project, taskid, fields=None):
        ret = self.es.get(index=self.index, doc_type=self.__type__,
                          id="%s:%s" % (project, taskid),
                          _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def drop(self, project):
        self.refresh()
        for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                 doc_type=self.__type__,
                                                 query={'query': {'term': {'project': project}}},
                                                 _source=False):
            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])

    def refresh(self):
        """
        Explicitly refresh one or more index, making all operations performed
        since the last refresh available for search.
        """
        self.es.indices.refresh(index=self.index)

# ===== weblocust/database/elasticsearch/__init__.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2016-01-17 18:31:58
# Modified on 2016-10-26 20:46:20
# ===== weblocust/database/elasticsearch/taskdb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2016-01-20 20:20:55
# Modified on 2016-10-26 20:46:20

import time
import json

import elasticsearch.helpers
from elasticsearch import Elasticsearch

from weblocust.database.base.taskdb import TaskDB as BaseTaskDB


class TaskDB(BaseTaskDB):
    __type__ = 'task'

    def __init__(self, hosts, index='weblocust'):
        self.index = index
        self._changed = False
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "project": {"type": "string", "index": "not_analyzed"},
                    "status": {"type": "byte"},
                }
            })

    def _parse(self, data):
        if not data:
            return data
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    @property
    def projects(self):
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"aggs": {"projects": {
                                 "terms": {"field": "project"}
                             }}}, _source=False)
        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]

    def load_tasks(self, status, project=None, fields=None):
        self.refresh()
        if project is None:
            for project in self.projects:
                for each in self.load_tasks(status, project, fields):
                    yield each
        else:
            for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                     doc_type=self.__type__,
                                                     query={'query': {'bool': {
                                                         'must': {'term': {'project': project}},
                                                         'should': [{'term': {'status': status}}],
                                                         'minimum_should_match': 1,
                                                     }}},
                                                     _source_include=fields or []):
                yield self._parse(record['_source'])

    def get_task(self, project, taskid, fields=None):
        if self._changed:
            self.refresh()
        ret = self.es.get(index=self.index, doc_type=self.__type__,
                          id="%s:%s" % (project, taskid),
                          _source_include=fields or [], ignore=404)
        return self._parse(ret.get('_source', None))

    def status_count(self, project):
        self.refresh()
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"query": {'term': {'project': project}},
                                   "aggs": {"status": {
                                       "terms": {"field": "status"}
                                   }}}, _source=False)
        result = {}
        for each in ret['aggregations']['status'].get('buckets', []):
            result[each['key']] = each['doc_count']
        return result

    def insert(self, project, taskid, obj={}):
        self._changed = True
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        return self.es.index(index=self.index, doc_type=self.__type__,
                             body=self._stringify(obj), id='%s:%s' % (project, taskid))

    def update(self, project, taskid, obj={}, **kwargs):
        self._changed = True
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__,
                              id='%s:%s' % (project, taskid),
                              body={"doc": self._stringify(obj)}, ignore=404)

    def drop(self, project):
        self.refresh()
        for record in elasticsearch.helpers.scan(self.es, index=self.index,
                                                 doc_type=self.__type__,
                                                 query={'query': {'term': {'project': project}}},
                                                 _source=False):
            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])
        self.refresh()

    def refresh(self):
        """
        Explicitly refresh one or more index, making all operations performed
        since the last refresh available for search.
        """
        self._changed = False
        self.es.indices.refresh(index=self.index)

# ===== weblocust/database/mysql/projectdb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-07-17 21:06:43
# Modified on 2016-10-26 20:46:20

import time

import mysql.connector

from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB
from weblocust.database.basedb import BaseDB
from .mysqlbase import MySQLMixin


class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB):
    __tablename__ = 'projectdb'

    def __init__(self, host='localhost', port=3306, database='projectdb',
                 user='root', passwd=None):
        self.database_name = database
        self.conn = mysql.connector.connect(user=user, password=passwd,
                                            host=host, port=port, autocommit=True)
        if database not in [x[0] for x in self._execute('show databases')]:
            self._execute('CREATE DATABASE %s' % self.escape(database))
        self.conn.database = database

        self._execute('''CREATE TABLE IF NOT EXISTS %s (
            `name` varchar(64) PRIMARY KEY,
            `group` varchar(64),
            `status` varchar(16),
            `script` TEXT,
            `comments` varchar(1024),
            `rate` float(11, 4),
            `burst` float(11, 4),
            `updatetime` double(16, 4)
            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__))

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self._insert(**obj)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        ret = self._update(where="`name` = %s" % self.placeholder,
                           where_values=(name, ), **obj)
        return ret.rowcount

    def get_all(self, fields=None):
        return self._select2dic(what=fields)

    def get(self, name, fields=None):
        where = "`name` = %s" % self.placeholder
        for each in self._select2dic(what=fields, where=where, where_values=(name, )):
            return each
        return None

    def drop(self, name):
        where = "`name` = %s" % self.placeholder
        return self._delete(where=where, where_values=(name, ))

    def check_update(self, timestamp, fields=None):
        where = "`updatetime` >= %f" % timestamp
        return self._select2dic(what=fields, where=where)
# ===== weblocust/database/mysql/resultdb.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-10-13 22:02:57
# Modified on 2016-10-26 20:46:20

import re
import six
import time
import json

import mysql.connector

from weblocust.libs import utils
from weblocust.database.base.resultdb import ResultDB as BaseResultDB
from weblocust.database.basedb import BaseDB
from .mysqlbase import MySQLMixin, SplitTableMixin


class ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB):
    __tablename__ = ''

    def __init__(self, host='localhost', port=3306, database='resultdb',
                 user='root', passwd=None):
        self.database_name = database
        self.conn = mysql.connector.connect(user=user, password=passwd,
                                            host=host, port=port, autocommit=True)
        if database not in [x[0] for x in self._execute('show databases')]:
            self._execute('CREATE DATABASE %s' % self.escape(database))
        self.conn.database = database
        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        if tablename in [x[0] for x in self._execute('show tables')]:
            return
        self._execute('''CREATE TABLE %s (
            `taskid` varchar(64) PRIMARY KEY,
            `url` varchar(1024),
            `result` MEDIUMBLOB,
            `updatetime` double(16, 4)
            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename))

    def _parse(self, data):
        for key, value in list(six.iteritems(data)):
            if isinstance(value, (bytearray, six.binary_type)):
                data[key] = utils.text(value)
        if 'result' in data:
            data['result'] = json.loads(data['result'])
        return data

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        tablename = self._tablename(project)
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self._replace(tablename, **self._stringify(obj))

    def select(self, project, fields=None, offset=0, limit=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)

        for task in self._select2dic(tablename, what=fields, order='updatetime DESC',
                                     offset=offset, limit=limit):
            yield self._parse(task)

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        tablename = self._tablename(project)
        for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)):
            return count

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        where = "`taskid` = %s" % self.placeholder
        for task in self._select2dic(tablename, what=fields,
                                     where=where, where_values=(taskid, )):
            return self._parse(task)

# ===== weblocust/database/mysql/__init__.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-07-17 20:12:54
# Modified on 2016-10-26 20:46:20
# ===== weblocust/database/mysql/mysqlbase.py =====
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux (http://binux.me)
# Contributor: qiulimao (http://www.getqiu.com)
# Created on 2014-11-05 10:42:24
# Modified on 2016-10-26 20:46:20

import time

import mysql.connector


class MySQLMixin(object):

    @property
    def dbcur(self):
        try:
            if self.conn.unread_result:
                self.conn.get_rows()
            return self.conn.cursor()
        except (mysql.connector.OperationalError, mysql.connector.InterfaceError):
            # reconnect and reselect the database on a dropped connection
            self.conn.ping(reconnect=True)
            self.conn.database = self.database_name
            return self.conn.cursor()


class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''
        for project, in self._execute('show tables;'):
            if project.startswith(prefix):
                project = project[len(prefix):]
                self.projects.add(project)

    def drop(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        self._execute("DROP TABLE %s" % self.escape(tablename))
        self._list_project()
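# ----- example (not part of the package) -----
# A minimal sketch of the split-table naming used by SplitTableMixin above:
# with a non-empty __tablename__ the project name is suffixed; with the empty
# string (as in the ResultDB/TaskDB classes in this package) the project name
# itself is the table name. The subclass names here are illustrative.
#
from weblocust.database.mysql.mysqlbase import SplitTableMixin

class Named(SplitTableMixin):
    __tablename__ = 'taskdb'   # illustrative prefix

class Bare(SplitTableMixin):
    __tablename__ = ''

print(Named()._tablename('blog'))  # -> 'taskdb_blog'
print(Bare()._tablename('blog'))   # -> 'blog'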
self._tablename(project) for each in self._select2dic( tablename, what=fields, where=where, where_values=(status, ) ): yield self._parse(each) def get_task(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return None where = "`taskid` = %s" % self.placeholder tablename = self._tablename(project) for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )): return self._parse(each) return None def status_count(self, project): result = dict() if project not in self.projects: self._list_project() if project not in self.projects: return result tablename = self._tablename(project) for status, count in self._execute("SELECT `status`, count(1) FROM %s GROUP BY `status`" % self.escape(tablename)): result[status] = count return result def insert(self, project, taskid, obj={}): if project not in self.projects: self._list_project() if project not in self.projects: self._create_project(project) self._list_project() obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() tablename = self._tablename(project) return self._insert(tablename, **self._stringify(obj)) def update(self, project, taskid, obj={}, **kwargs): if project not in self.projects: self._list_project() if project not in self.projects: raise LookupError tablename = self._tablename(project) obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() return self._update( tablename, where="`taskid` = %s" % self.placeholder, where_values=(taskid, ), **self._stringify(obj) ) PKd6[Ievv/weblocust/database/sqlalchemy/sqlalchemybase.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-04 18:48:47 # Modified on 2016-10-26 20:46:20 import time def result2dict(columns, task): r = {} for key in task.keys(): r[key] = task[key] return r class SplitTableMixin(object): UPDATE_PROJECTS_TIME = 10 * 60 def _tablename(self, project): if self.__tablename__: return '%s_%s' % (self.__tablename__, project) else: return project @property def projects(self): if time.time() - getattr(self, '_last_update_projects', 0) \ > self.UPDATE_PROJECTS_TIME: self._list_project() return self._projects @projects.setter def projects(self, value): self._projects = value def _list_project(self): self._last_update_projects = time.time() self.projects = set() if self.__tablename__: prefix = '%s_' % self.__tablename__ else: prefix = '' for project in self.engine.table_names(): if project.startswith(prefix): project = project[len(prefix):] self.projects.add(project) def drop(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return self.table.name = self._tablename(project) self.table.drop(self.engine) self._list_project() PKd6[Ill*weblocust/database/sqlalchemy/projectdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-04 23:25:10 # Modified on 2016-10-26 20:46:20 import six import time import sqlalchemy.exc from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text from sqlalchemy.engine.url import make_url from weblocust.libs import utils from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB from .sqlalchemybase import result2dict class 
ProjectDB(BaseProjectDB): __tablename__ = 'projectdb' def __init__(self, url): self.table = Table(self.__tablename__, MetaData(), Column('name', String(64)), Column('group', String(64)), Column('status', String(16)), Column('script', Text), Column('comments', String(1024)), Column('rate', Float(11)), Column('burst', Float(11)), Column('updatetime', Float(32)), mysql_engine='InnoDB', mysql_charset='utf8' ) self.url = make_url(url) if self.url.database: database = self.url.database self.url.database = None try: engine = create_engine(self.url) conn = engine.connect() conn.execute("commit") conn.execute("CREATE DATABASE %s" % database) except sqlalchemy.exc.SQLAlchemyError: pass self.url.database = database self.engine = create_engine(url) self.table.create(self.engine, checkfirst=True) @staticmethod def _parse(data): return data @staticmethod def _stringify(data): return data def insert(self, name, obj={}): obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() return self.engine.execute(self.table.insert() .values(**self._stringify(obj))) def update(self, name, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() return self.engine.execute(self.table.update() .where(self.table.c.name == name) .values(**self._stringify(obj))) def get_all(self, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() .with_only_columns(columns)): yield self._parse(result2dict(columns, task)) def get(self, name, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() .where(self.table.c.name == name) .limit(1) .with_only_columns(columns)): return self._parse(result2dict(columns, task)) def drop(self, name): return self.engine.execute(self.table.delete() .where(self.table.c.name == name)) def check_update(self, timestamp, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() .with_only_columns(columns) .where(self.table.c.updatetime >= timestamp)): yield self._parse(result2dict(columns, task)) PKd6[IlE~)weblocust/database/sqlalchemy/resultdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-04 18:48:15 # Modified on 2016-10-26 20:46:20 import re import six import time import json import sqlalchemy.exc from sqlalchemy import (create_engine, MetaData, Table, Column, String, Float, LargeBinary) from sqlalchemy.engine.url import make_url from weblocust.database.base.resultdb import ResultDB as BaseResultDB from weblocust.libs import utils from .sqlalchemybase import SplitTableMixin, result2dict class ResultDB(SplitTableMixin, BaseResultDB): __tablename__ = '' def __init__(self, url): self.table = Table('__tablename__', MetaData(), Column('taskid', String(64), primary_key=True, nullable=False), Column('url', String(1024)), Column('result', LargeBinary), Column('updatetime', Float(32)), mysql_engine='InnoDB', mysql_charset='utf8' ) self.url = make_url(url) if self.url.database: database = self.url.database self.url.database = None try: engine = create_engine(self.url, convert_unicode=True) engine.execute("CREATE DATABASE IF NOT EXISTS %s" % database) except sqlalchemy.exc.SQLAlchemyError: pass self.url.database = database self.engine = 
create_engine(url, convert_unicode=True) self._list_project() def _create_project(self, project): assert re.match(r'^\w+$', project) is not None if project in self.projects: return self.table.name = self._tablename(project) self.table.create(self.engine) @staticmethod def _parse(data): for key, value in list(six.iteritems(data)): if isinstance(value, six.binary_type): data[key] = utils.text(value) if 'result' in data: if isinstance(data['result'], bytearray): data['result'] = str(data['result']) data['result'] = json.loads(data['result']) return data @staticmethod def _stringify(data): if 'result' in data: data['result'] = utils.utf8(json.dumps(data['result'])) return data def save(self, project, taskid, url, result): if project not in self.projects: self._create_project(project) self._list_project() self.table.name = self._tablename(project) obj = { 'taskid': taskid, 'url': url, 'result': result, 'updatetime': time.time(), } if self.get(project, taskid, ('taskid', )): del obj['taskid'] return self.engine.execute(self.table.update() .where(self.table.c.taskid == taskid) .values(**self._stringify(obj))) else: return self.engine.execute(self.table.insert() .values(**self._stringify(obj))) def select(self, project, fields=None, offset=0, limit=None): if project not in self.projects: self._list_project() if project not in self.projects: return self.table.name = self._tablename(project) columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() .with_only_columns(columns=columns) .order_by(self.table.c.updatetime.desc()) .offset(offset).limit(limit) .execution_options(autocommit=True)): yield self._parse(result2dict(columns, task)) def count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return 0 self.table.name = self._tablename(project) for count, in self.engine.execute(self.table.count()): return count def get(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return self.table.name = self._tablename(project) columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() .with_only_columns(columns=columns) .where(self.table.c.taskid == taskid) .limit(1)): return self._parse(result2dict(columns, task)) PKd6[I))..)weblocust/database/sqlalchemy/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-04 20:11:04 # Modified on 2016-10-26 20:46:20 PKd6[I2xž``'weblocust/database/sqlalchemy/taskdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-04 22:33:43 # Modified on 2016-10-26 20:46:20 import re import six import time import json import sqlalchemy.exc from sqlalchemy import (create_engine, MetaData, Table, Column, Index, Integer, String, Float, LargeBinary, func) from sqlalchemy.engine.url import make_url from weblocust.libs import utils from weblocust.database.base.taskdb import TaskDB as BaseTaskDB from .sqlalchemybase import SplitTableMixin, result2dict class TaskDB(SplitTableMixin, BaseTaskDB): __tablename__ = '' def __init__(self, url): self.table = Table('__tablename__', MetaData(), Column('taskid', 
String(64), primary_key=True, nullable=False), Column('project', String(64)), Column('url', String(1024)), Column('status', Integer), Column('schedule', LargeBinary), Column('fetch', LargeBinary), Column('process', LargeBinary), Column('track', LargeBinary), Column('lastcrawltime', Float(32)), Column('updatetime', Float(32)), mysql_engine='InnoDB', mysql_charset='utf8' ) self.url = make_url(url) if self.url.database: database = self.url.database self.url.database = None try: engine = create_engine(self.url) conn = engine.connect() conn.execute("commit") conn.execute("CREATE DATABASE %s" % database) except sqlalchemy.exc.SQLAlchemyError: pass self.url.database = database self.engine = create_engine(url) self._list_project() def _create_project(self, project): assert re.match(r'^\w+$', project) is not None if project in self.projects: return self.table.name = self._tablename(project) Index('status_%s_index' % self.table.name, self.table.c.status) self.table.create(self.engine, checkfirst=True) self.table.indexes.clear() @staticmethod def _parse(data): for key, value in list(six.iteritems(data)): if isinstance(value, six.binary_type): data[key] = utils.text(value) for each in ('schedule', 'fetch', 'process', 'track'): if each in data: if data[each]: if isinstance(data[each], bytearray): data[each] = str(data[each]) data[each] = json.loads(data[each]) else: data[each] = {} return data @staticmethod def _stringify(data): for each in ('schedule', 'fetch', 'process', 'track'): if each in data: data[each] = utils.utf8(json.dumps(data[each])) return data def load_tasks(self, status, project=None, fields=None): if project and project not in self.projects: return if project: projects = [project, ] else: projects = self.projects columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for project in projects: self.table.name = self._tablename(project) for task in self.engine.execute(self.table.select() .with_only_columns(columns) .where(self.table.c.status == status)): yield self._parse(result2dict(columns, task)) def get_task(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return None self.table.name = self._tablename(project) columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for each in self.engine.execute(self.table.select() .with_only_columns(columns) .limit(1) .where(self.table.c.taskid == taskid)): return self._parse(result2dict(columns, each)) def status_count(self, project): result = dict() if project not in self.projects: self._list_project() if project not in self.projects: return result self.table.name = self._tablename(project) for status, count in self.engine.execute( self.table.select() .with_only_columns((self.table.c.status, func.count(1))) .group_by(self.table.c.status)): result[status] = count return result def insert(self, project, taskid, obj={}): if project not in self.projects: self._list_project() if project not in self.projects: self._create_project(project) self._list_project() obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() self.table.name = self._tablename(project) return self.engine.execute(self.table.insert() .values(**self._stringify(obj))) def update(self, project, taskid, obj={}, **kwargs): if project not in self.projects: self._list_project() if project not in self.projects: raise LookupError self.table.name = self._tablename(project) obj = dict(obj) obj.update(kwargs) obj['updatetime'] 
= time.time() return self.engine.execute(self.table.update() .where(self.table.c.taskid == taskid) .values(**self._stringify(obj))) PKd6[IO 9 9 %weblocust/database/local/projectdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-01-17 12:32:17 # Modified on 2016-10-26 20:46:20 import os import re import six import glob import logging from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB class ProjectDB(BaseProjectDB): """ProjectDB loading scripts from local file.""" def __init__(self, files): self.files = files self.projects = {} self.load_scripts() def load_scripts(self): project_names = set(self.projects.keys()) for path in self.files: for filename in glob.glob(path): name = os.path.splitext(os.path.basename(filename))[0] if name in project_names: project_names.remove(name) updatetime = os.path.getmtime(filename) if name not in self.projects or updatetime > self.projects[name]['updatetime']: project = self._build_project(filename) if not project: continue self.projects[project['name']] = project for name in project_names: del self.projects[name] rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M) burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M) def _build_project(self, filename): try: with open(filename) as fp: script = fp.read() m = self.rate_re.search(script) if m: rate = float(m.group(1)) else: rate = 1 m = self.burst_re.search(script) if m: burst = float(m.group(1)) else: burst = 3 return { 'name': os.path.splitext(os.path.basename(filename))[0], 'group': None, 'status': 'RUNNING', 'script': script, 'comments': None, 'rate': rate, 'burst': burst, 'updatetime': os.path.getmtime(filename), } except OSError as e: logging.error('loading project script error: %s', e) return None def get_all(self, fields=None): for projectname in self.projects: yield self.get(projectname, fields) def get(self, name, fields=None): if name not in self.projects: return None project = self.projects[name] result = {} for f in fields or project: if f in project: result[f] = project[f] else: result[f] = None return result def check_update(self, timestamp, fields=None): self.load_scripts() for projectname, project in six.iteritems(self.projects): if project['updatetime'] > timestamp: yield self.get(projectname, fields) PKd6[IA8--$weblocust/database/local/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-01-17 20:56:50 # Modified on 2016-10-26 20:46:20 PKd6[I'weblocust/database/sqlite/sqlitebase.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-11-22 20:30:44 # Modified on 2016-10-26 20:46:20 import os import time import sqlite3 import threading class SQLiteMixin(object): @property def dbcur(self): pid = (os.getpid(), threading.current_thread().ident) if not (self.conn and pid == self.last_pid): self.last_pid = pid self.conn = sqlite3.connect(self.path, isolation_level=None) return self.conn.cursor() class SplitTableMixin(object): UPDATE_PROJECTS_TIME = 10 * 60 def _tablename(self, project): if self.__tablename__: return '%s_%s' % (self.__tablename__, project) else: return project @property def 
projects(self): if time.time() - getattr(self, '_last_update_projects', 0) \ > self.UPDATE_PROJECTS_TIME: self._list_project() return self._projects @projects.setter def projects(self, value): self._projects = value def _list_project(self): self._last_update_projects = time.time() self.projects = set() if self.__tablename__: prefix = '%s_' % self.__tablename__ else: prefix = '' for project, in self._select('sqlite_master', what='name', where='type = "table"'): if project.startswith(prefix): project = project[len(prefix):] self.projects.add(project) def drop(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return tablename = self._tablename(project) self._execute("DROP TABLE %s" % self.escape(tablename)) self._list_project() PKd6[I"6i&weblocust/database/sqlite/projectdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-09 12:05:52 # Modified on 2016-10-26 20:46:20 import time from .sqlitebase import SQLiteMixin from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB from weblocust.database.basedb import BaseDB class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): __tablename__ = 'projectdb' placeholder = '?' def __init__(self, path): self.path = path self.last_pid = 0 self.conn = None self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( name PRIMARY KEY, `group`, status, script, comments, rate, burst, updatetime )''' % self.__tablename__) def insert(self, name, obj={}): obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() return self._insert(**obj) def update(self, name, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) return ret.rowcount def get_all(self, fields=None): return self._select2dic(what=fields) def get(self, name, fields=None): where = "`name` = %s" % self.placeholder for each in self._select2dic(what=fields, where=where, where_values=(name, )): return each return None def check_update(self, timestamp, fields=None): where = "`updatetime` >= %f" % timestamp return self._select2dic(what=fields, where=where) def drop(self, name): where = "`name` = %s" % self.placeholder return self._delete(where=where, where_values=(name, )) PKd6[IQ %weblocust/database/sqlite/resultdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-13 17:08:43 # Modified on 2016-10-26 20:46:20 import re import time import json from .sqlitebase import SQLiteMixin, SplitTableMixin from weblocust.database.base.resultdb import ResultDB as BaseResultDB from weblocust.database.basedb import BaseDB class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): __tablename__ = 'resultdb' placeholder = '?' 
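    # A minimal usage sketch (hypothetical project name and path; in normal
    # operation weblocust.database.connect_database builds this object from a
    # sqlite+resultdb:// URL rather than by direct construction):
    #
    #     db = ResultDB('./data/result.db')
    #     db.save('demo', 'taskid-123', 'http://example.com/', {'title': 'hi'})
    #     assert db.count('demo') == 1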
def __init__(self, path): self.path = path self.last_pid = 0 self.conn = None self._list_project() def _create_project(self, project): assert re.match(r'^\w+$', project) is not None tablename = self._tablename(project) self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( taskid PRIMARY KEY, url, result, updatetime )''' % tablename) def _parse(self, data): if 'result' in data: data['result'] = json.loads(data['result']) return data def _stringify(self, data): if 'result' in data: data['result'] = json.dumps(data['result']) return data def save(self, project, taskid, url, result): tablename = self._tablename(project) if project not in self.projects: self._create_project(project) self._list_project() obj = { 'taskid': taskid, 'url': url, 'result': result, 'updatetime': time.time(), } return self._replace(tablename, **self._stringify(obj)) def select(self, project, fields=None, offset=0, limit=None): if project not in self.projects: self._list_project() if project not in self.projects: return tablename = self._tablename(project) for task in self._select2dic(tablename, what=fields, order='updatetime DESC', offset=offset, limit=limit): yield self._parse(task) def count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return 0 tablename = self._tablename(project) for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): return count def get(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return tablename = self._tablename(project) where = "`taskid` = %s" % self.placeholder for task in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )): return self._parse(task) PKd6[I%weblocust/database/sqlite/__init__.pyPKd6[Ih)((#weblocust/database/sqlite/taskdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-08 10:25:34 # Modified on 2016-10-26 20:46:20 import re import time import json from .sqlitebase import SQLiteMixin, SplitTableMixin from weblocust.database.base.taskdb import TaskDB as BaseTaskDB from weblocust.database.basedb import BaseDB class TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB): __tablename__ = 'taskdb' placeholder = '?' 
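    # '?' is sqlite3's qmark parameter style: BaseDB splices the placeholder
    # into WHERE clauses (e.g. "`taskid` = ?") and hands the values to the
    # driver separately, so task ids are never string-formatted into SQL.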
def __init__(self, path): self.path = path self.last_pid = 0 self.conn = None self._list_project() def _create_project(self, project): assert re.match(r'^\w+$', project) is not None tablename = self._tablename(project) self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( taskid PRIMARY KEY, project, url, status, schedule, fetch, process, track, lastcrawltime, updatetime )''' % tablename) self._execute( '''CREATE INDEX `status_%s_index` ON %s (status)''' % (tablename, self.escape(tablename)) ) def _parse(self, data): for each in ('schedule', 'fetch', 'process', 'track'): if each in data: if data[each]: data[each] = json.loads(data[each]) else: data[each] = {} return data def _stringify(self, data): for each in ('schedule', 'fetch', 'process', 'track'): if each in data: data[each] = json.dumps(data[each]) return data def load_tasks(self, status, project=None, fields=None): if project and project not in self.projects: return where = "status = %d" % status if project: projects = [project, ] else: projects = self.projects for project in projects: tablename = self._tablename(project) for each in self._select2dic(tablename, what=fields, where=where): yield self._parse(each) def get_task(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return None where = "`taskid` = %s" % self.placeholder if project not in self.projects: return None tablename = self._tablename(project) for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )): return self._parse(each) return None def status_count(self, project): ''' return a dict ''' result = dict() if project not in self.projects: self._list_project() if project not in self.projects: return result tablename = self._tablename(project) for status, count in self._execute("SELECT `status`, count(1) FROM %s GROUP BY `status`" % self.escape(tablename)): result[status] = count return result def insert(self, project, taskid, obj={}): if project not in self.projects: self._create_project(project) self._list_project() obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() tablename = self._tablename(project) return self._insert(tablename, **self._stringify(obj)) def update(self, project, taskid, obj={}, **kwargs): if project not in self.projects: raise LookupError tablename = self._tablename(project) obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() return self._update( tablename, where="`taskid` = %s" % self.placeholder, where_values=(taskid, ), **self._stringify(obj) ) PKd6[I~@ @ 'weblocust/database/mongodb/projectdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-12 12:22:42 # Modified on 2016-10-26 20:46:20 import time from pymongo import MongoClient from weblocust.database.base.projectdb import ProjectDB as BaseProjectDB class ProjectDB(BaseProjectDB): __collection_name__ = 'projectdb' def __init__(self, url, database='projectdb'): self.conn = MongoClient(url) self.conn.admin.command("ismaster") self.database = self.conn[database] self.collection = self.database[self.__collection_name__] self.collection.ensure_index('name', unique=True) def _default_fields(self, each): if each is None: return each each.setdefault('group', None) each.setdefault('status', 'TODO') each.setdefault('script', '') each.setdefault('comments', None) each.setdefault('rate', 0) 
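# note: rate/burst fall back to 0 here, unlike the 1/3 defaults used by the
# local-file ProjectDB, so a bare mongodb project record is effectively
# throttled until rate limits are set explicitly (an observation from the
# defaults above, not documented behaviour)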
each.setdefault('burst', 0) each.setdefault('updatetime', 0) return each def insert(self, name, obj={}): obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() return self.collection.update({'name': name}, {'$set': obj}, upsert=True) def update(self, name, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() return self.collection.update({'name': name}, {'$set': obj}) def get_all(self, fields=None): for each in self.collection.find({}, fields): if each and '_id' in each: del each['_id'] yield self._default_fields(each) def get(self, name, fields=None): each = self.collection.find_one({'name': name}, fields) if each and '_id' in each: del each['_id'] return self._default_fields(each) def check_update(self, timestamp, fields=None): for project in self.get_all(fields=('updatetime', 'name')): if project['updatetime'] > timestamp: project = self.get(project['name'], fields) yield self._default_fields(project) def drop(self, name): return self.collection.remove({'name': name}) PKd6[IH%&weblocust/database/mongodb/resultdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-13 22:18:36 # Modified on 2016-10-26 20:46:20 import json import time from pymongo import MongoClient import pymongo from weblocust.database.base.resultdb import ResultDB as BaseResultDB from .mongodbbase import SplitTableMixin class ResultDB(SplitTableMixin, BaseResultDB): collection_prefix = '' def __init__(self, url, database='resultdb'): self.conn = MongoClient(url) self.conn.admin.command("ismaster") self.database = self.conn[database] self.projects = set() self._list_project() for project in self.projects: collection_name = self._collection_name(project) #self.database[collection_name].ensure_index('taskid') #self.database[collection_name].create_index('updatetime') self.ensure_index(collection_name) def _parse(self, data): data['_id'] = str(data['_id']) if 'result' in data: #data['result'] = json.loads(data['result']) pass return data def _stringify(self, data): """ it is in mongodb,why stringify this? 
""" return data if 'result' in data: data['result'] = json.dumps(data['result']) return data def save(self, project, taskid, url, result): collection_name = self._collection_name(project) obj = { 'taskid': taskid, 'url': url, 'result': result, 'updatetime': time.time(), } return self.database[collection_name].update( {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True ) ## # we have one2many relationship sometimes, add by qiulimao@2016.05.21 ## def asave(self,project,taskid,url,result): """ db.book.update({'user':'body'}, {'$addToSet':{books:{'$each':['心经','楞严经','阿弥陀佛经','金刚经']}}); """ collection_name = self._collection_name(project) extraid = result.get("__extraid__") refer = result.get("__refer__") obj = { 'taskid': taskid, 'extraid':extraid if extraid else "__main__", 'refer':refer if refer else "__self__", 'url': url, 'result':result, 'updatetime': time.time(), } return self.database[collection_name].update({'taskid': taskid,'extraid':extraid}, {"$set":obj}, upsert=True) def select(self, project, fields=None, offset=0, limit=0): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) results = self.database[collection_name].find({}, fields, skip=offset, limit=limit).sort("updatetime",pymongo.DESCENDING) for result in results: yield self._parse(result) ## # select by condition ## def select_by(self,project,condition={},offset=0,limit=0): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) results = self.database[collection_name].find(condition, skip=offset, limit=limit).sort("updatetime",pymongo.DESCENDING) for result in results: yield self._parse(result) def count_by(self,project,condition={}): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) return self.database[collection_name].find(condition).count() def remove(self,project): """ remove all the results in result database """ if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) return self.database[collection_name].remove() def size(self,project): """ return the size of result database """ return self.count(project) ## # ------------------------------------------------------------------------ ## def count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) return self.database[collection_name].count() def get(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) ret = self.database[collection_name].find_one({'taskid': taskid}, fields) if not ret: return ret return self._parse(ret) def ensure_index(self,collection_name): # 因为result 的索引实际上的taskid,所以这里建taskid的索引 self.database[collection_name].create_index([ ("taskid", pymongo.ASCENDING), ("extraid", pymongo.ASCENDING) ]) self.database[collection_name].create_index('updatetime') self.database[collection_name].create_index([ ('refer',pymongo.DESCENDING), ("updatetime",pymongo.DESCENDING) ]) PKd6[I&weblocust/database/mongodb/__init__.pyPKd6[I+hII)weblocust/database/mongodb/mongodbbase.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: 
Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-11-22 20:42:01 # Modified on 2016-10-26 20:46:20 import time class SplitTableMixin(object): UPDATE_PROJECTS_TIME = 10 * 60 def _collection_name(self, project): if self.collection_prefix: return "%s.%s" % (self.collection_prefix, project) else: return project @property def projects(self): if time.time() - getattr(self, '_last_update_projects', 0) \ > self.UPDATE_PROJECTS_TIME: self._list_project() return self._projects @projects.setter def projects(self, value): self._projects = value def _list_project(self): self._last_update_projects = time.time() self.projects = set() if self.collection_prefix: prefix = "%s." % self.collection_prefix else: prefix = '' for each in self.database.collection_names(): if each.startswith('system.'): continue if each.startswith(prefix): self.projects.add(each[len(prefix):]) def drop(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) self.database[collection_name].drop() self._list_project() PKd6[I[99$weblocust/database/mongodb/taskdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-11 23:54:50 # Modified on 2016-10-26 20:46:20 import json import time from pymongo import MongoClient from weblocust.database.base.taskdb import TaskDB as BaseTaskDB from .mongodbbase import SplitTableMixin class TaskDB(SplitTableMixin, BaseTaskDB): collection_prefix = '' def __init__(self, url, database='taskdb'): self.conn = MongoClient(url) self.conn.admin.command("ismaster") self.database = self.conn[database] self.projects = set() self._list_project() for project in self.projects: collection_name = self._collection_name(project) self.database[collection_name].ensure_index('status') self.database[collection_name].ensure_index('taskid') def _parse(self, data): if '_id' in data: del data['_id'] ## # comment out by qiulimao@2016.05 ## #for each in ('schedule', 'fetch', 'process', 'track'): # if each in data: # if data[each]: # if isinstance(data[each], bytearray): # data[each] = str(data[each]) # data[each] = json.loads(data[each], 'utf8') # else: # data[each] = {} return data def _stringify(self, data): """ It is in mongodb,we can save json derectly """ return data ## # we don't need it in mongo db ## for each in ('schedule', 'fetch', 'process', 'track'): if each in data: data[each] = json.dumps(data[each]) return data def remove(self,project): """ remove all tasks in taskdb """ if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) #ret = self.database[collection_name].find_one({'taskid': taskid}, fields) self.database[collection_name].remove() def size(self,project): """ get the size of the tasks in taskdb """ if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) ret = self.database[collection_name].find().count() return ret def load_tasks(self, status, project=None, fields=None): if not project: self._list_project() if project: projects = [project, ] else: projects = self.projects for project in projects: collection_name = self._collection_name(project) for task in self.database[collection_name].find({'status': status}, fields): yield self._parse(task) 
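    # Rough consumer-side sketch (project name hypothetical); status values
    # come from the base TaskDB constants, e.g. ACTIVE == 1:
    #
    #     for task in taskdb.load_tasks(TaskDB.ACTIVE, project='demo',
    #                                   fields=['taskid', 'url']):
    #         print(task['taskid'], task['url'])
    #
    # When no project is given, the project list is refreshed first and one
    # MongoDB collection is queried per project.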
def get_task(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return collection_name = self._collection_name(project) ret = self.database[collection_name].find_one({'taskid': taskid}, fields) if not ret: return ret return self._parse(ret) def status_count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return {} collection_name = self._collection_name(project) ret = self.database[collection_name].aggregate([ {'$group': { '_id': '$status', 'total': { '$sum': 1 } } }]) result = {} if isinstance(ret, dict): ret = ret.get('result', []) for each in ret: result[each['_id']] = each['total'] return result def insert(self, project, taskid, obj={}): obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() return self.update(project, taskid, obj=obj) def update(self, project, taskid, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() collection_name = self._collection_name(project) return self.database[collection_name].update( {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True ) def ensure_index(self,collection_name): self.database[collection_name].ensure_index('status') self.database[collection_name].ensure_index('taskid') PKd6[I~..$weblocust/database/redis/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-05-17 01:34:21 # Modified on 2016-10-26 20:46:20 PKd6[Ia  "weblocust/database/redis/taskdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-05-16 21:01:52 # Modified on 2016-10-26 20:46:20 import six import time import json import redis import logging import itertools from weblocust.libs import utils from weblocust.database.base.taskdb import TaskDB as BaseTaskDB class TaskDB(BaseTaskDB): UPDATE_PROJECTS_TIME = 10 * 60 __prefix__ = 'taskdb_' def __init__(self, host='localhost', port=6379, db=0): self.redis = redis.StrictRedis(host=host, port=port, db=db) try: self.redis.scan(count=1) self.scan_available = True except Exception as e: logging.debug("redis_scan disabled: %r", e) self.scan_available = False def _gen_key(self, project, taskid): return "%s%s_%s" % (self.__prefix__, project, taskid) def _gen_status_key(self, project, status): return '%s%s_status_%d' % (self.__prefix__, project, status) def _parse(self, data): if six.PY3: result = {} for key, value in data.items(): if isinstance(value, bytes): value = utils.text(value) result[utils.text(key)] = value data = result for each in ('schedule', 'fetch', 'process', 'track'): if each in data: if data[each]: data[each] = json.loads(data[each]) else: data[each] = {} if 'status' in data: data['status'] = int(data['status']) if 'lastcrawltime' in data: data['lastcrawltime'] = float(data['lastcrawltime'] or 0) if 'updatetime' in data: data['updatetime'] = float(data['updatetime'] or 0) return data def _stringify(self, data): for each in ('schedule', 'fetch', 'process', 'track'): if each in data: data[each] = json.dumps(data[each]) return data @property def projects(self): if time.time() - getattr(self, '_last_update_projects', 0) \ > self.UPDATE_PROJECTS_TIME: self._projects = set(utils.text(x) for x in self.redis.smembers( 
self.__prefix__ + 'projects')) return self._projects def load_tasks(self, status, project=None, fields=None): if project is None: project = self.projects elif not isinstance(project, list): project = [project, ] if self.scan_available: scan_method = self.redis.sscan_iter else: scan_method = self.redis.smembers if fields: def get_method(key): obj = self.redis.hmget(key, fields) if all(x is None for x in obj): return None return dict(zip(fields, obj)) else: get_method = self.redis.hgetall for p in project: status_key = self._gen_status_key(p, status) for taskid in scan_method(status_key): obj = get_method(self._gen_key(p, utils.text(taskid))) if not obj: #self.redis.srem(status_key, taskid) continue else: yield self._parse(obj) def get_task(self, project, taskid, fields=None): if fields: obj = self.redis.hmget(self._gen_key(project, taskid), fields) if all(x is None for x in obj): return None obj = dict(zip(fields, obj)) else: obj = self.redis.hgetall(self._gen_key(project, taskid)) if not obj: return None return self._parse(obj) def status_count(self, project): ''' return a dict ''' pipe = self.redis.pipeline(transaction=False) for status in range(1, 5): pipe.scard(self._gen_status_key(project, status)) ret = pipe.execute() result = {} for status, count in enumerate(ret): if count > 0: result[status + 1] = count return result def insert(self, project, taskid, obj={}): obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() obj.setdefault('status', self.ACTIVE) task_key = self._gen_key(project, taskid) pipe = self.redis.pipeline(transaction=False) if project not in self.projects: pipe.sadd(self.__prefix__ + 'projects', project) pipe.hmset(task_key, self._stringify(obj)) pipe.sadd(self._gen_status_key(project, obj['status']), taskid) pipe.execute() def update(self, project, taskid, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() pipe = self.redis.pipeline(transaction=False) pipe.hmset(self._gen_key(project, taskid), self._stringify(obj)) if 'status' in obj: for status in range(1, 5): if status == obj['status']: pipe.sadd(self._gen_status_key(project, status), taskid) else: pipe.srem(self._gen_status_key(project, status), taskid) pipe.execute() def drop(self, project): self.redis.srem(self.__prefix__ + 'projects', project) if self.scan_available: scan_method = self.redis.scan_iter else: scan_method = self.redis.keys for each in itertools.tee(scan_method("%s%s_*" % (self.__prefix__, project)), 100): each = list(each) if each: self.redis.delete(*each) PKd6[In8ww$weblocust/database/base/projectdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-09 11:28:52 # Modified on 2016-10-26 20:46:20 import re # NOTE: When get/get_all/check_update from database with default fields, # all following fields should be included in output dict. 
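# (the mongodb ProjectDB enforces that contract in _default_fields(); the
# SQL back-ends get it for free from their fixed column definitions)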
{ 'project': { 'name': str, 'group': str, 'status': str, 'script': str, # 'config': str, 'comments': str, # 'priority': int, 'rate': int, 'burst': int, 'updatetime': int, } } class ProjectDB(object): status_str = [ 'TODO', 'STOP', 'CHECKING', 'DEBUG', 'RUNNING', ] def insert(self, name, obj={}): raise NotImplementedError def update(self, name, obj={}, **kwargs): raise NotImplementedError def get_all(self, fields=None): raise NotImplementedError def get(self, name, fields): raise NotImplementedError def drop(self, name): raise NotImplementedError def check_update(self, timestamp, fields=None): raise NotImplementedError def split_group(self, group, lower=True): return re.split("\W+", (group or '').lower()) def verify_project_name(self, name): if len(name) > 64: return False if re.search(r"[^\w]", name): return False return True def copy(self): ''' database should be able to copy itself to create new connection it's implemented automatically by weblocust.database.connect_database if you are not create database connection via connect_database method, you should implement this ''' raise NotImplementedError PKd6[I:MLtt#weblocust/database/base/resultdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-11 18:40:03 # Modified on 2016-10-26 20:46:20 # result schema { 'result': { 'taskid': str, # new, not changeable 'project': str, # new, not changeable 'url': str, # new, not changeable 'result': str, # json string 'updatetime': int, } } class ResultDB(object): """ database for result """ projects = set() # projects in resultdb def save(self, project, taskid, url, result): raise NotImplementedError def select(self, project, fields=None, offset=0, limit=None): raise NotImplementedError def count(self, project): raise NotImplementedError def get(self, project, taskid, fields=None): raise NotImplementedError def drop(self, project): raise NotImplementedError def copy(self): ''' database should be able to copy itself to create new connection it's implemented automatically by weblocust.database.connect_database if you are not create database connection via connect_database method, you should implement this ''' raise NotImplementedError PKd6[I#weblocust/database/base/__init__.pyPKd6[ID !weblocust/database/base/taskdb.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-08 10:28:48 # Modified on 2016-10-26 20:46:20 # task schema { 'task': { 'taskid': str, # new, not change 'project': str, # new, not change 'url': str, # new, not change 'status': int, # change 'schedule': { 'priority': int, 'retries': int, 'retried': int, 'exetime': int, 'age': int, 'itag': str, # 'recrawl': int }, # new and restart 'fetch': { 'method': str, 'headers': dict, 'data': str, 'timeout': int, 'save': dict, }, # new and restart 'process': { 'callback': str, }, # new and restart 'track': { 'fetch': { 'ok': bool, 'time': int, 'status_code': int, 'headers': dict, 'encoding': str, 'content': str, }, 'process': { 'ok': bool, 'time': int, 'follows': int, 'outputs': int, 'logs': str, 'exception': str, }, 'save': object, # jsonable object saved by processor }, # finish 'lastcrawltime': int, # keep between request 'updatetime': int, # keep between request } } class TaskDB(object): ACTIVE = 1 SUCCESS = 2 FAILED = 3 BAD = 4 projects = 
set() # projects in taskdb def load_tasks(self, status, project=None, fields=None): raise NotImplementedError def get_task(self, project, taskid, fields=None): raise NotImplementedError def status_count(self, project): ''' return a dict ''' raise NotImplementedError def insert(self, project, taskid, obj={}): raise NotImplementedError def update(self, project, taskid, obj={}, **kwargs): raise NotImplementedError def drop(self, project): raise NotImplementedError @staticmethod def status_to_string(status): return { 1: 'ACTIVE', 2: 'SUCCESS', 3: 'FAILED', 4: 'BAD', }.get(status, 'UNKNOWN') @staticmethod def status_to_int(status): return { 'ACTIVE': 1, 'SUCCESS': 2, 'FAILED': 3, 'BAD': 4, }.get(status, 4) def copy(self): ''' database should be able to copy itself to create new connection it's implemented automatically by weblocust.database.connect_database if you are not create database connection via connect_database method, you should implement this ''' raise NotImplementedError PKd6[I!bweblocust/libs/dataurl.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-16 10:33:20 # Modified on 2016-10-26 20:46:20 import six from base64 import b64encode, b64decode from . import utils from six.moves.urllib.parse import quote, unquote def encode(data, mime_type='', charset='utf-8', base64=True): """ Encode data to DataURL """ if isinstance(data, six.text_type): data = data.encode(charset) else: charset = None if base64: data = utils.text(b64encode(data)) else: data = utils.text(quote(data)) result = ['data:', ] if mime_type: result.append(mime_type) if charset: result.append(';charset=') result.append(charset) if base64: result.append(';base64') result.append(',') result.append(data) return ''.join(result) def decode(data_url): """ Decode DataURL data """ metadata, data = data_url.rsplit(',', 1) _, metadata = metadata.split('data:', 1) parts = metadata.split(';') if parts[-1] == 'base64': data = b64decode(data) else: data = unquote(data) for part in parts: if part.startswith("charset="): data = data.decode(part[8:]) return data PKd6[I d==weblocust/libs/useragent.py#coding:utf-8 IphoneSafari = "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3" LinuxChrome = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36"PKd6[Ivmweblocust/libs/log.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-10-24 16:08:17 # Modified on 2016-10-26 20:46:20 import logging try: import curses except ImportError: curses = None from tornado.log import LogFormatter as _LogFormatter class LogFormatter(_LogFormatter, object): """Init tornado.log.LogFormatter from logging.config.fileConfig""" def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): if fmt is None: fmt = _LogFormatter.DEFAULT_FORMAT super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) class SaveLogHandler(logging.Handler): """LogHandler that save records to a list""" def __init__(self, saveto=None, *args, **kwargs): self.saveto = saveto logging.Handler.__init__(self, *args, **kwargs) def emit(self, record): if self.saveto is not None: self.saveto.append(record) 
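    # aliasing handle to emit (below) skips logging.Handler.handle's
    # filter and lock machinery, so every record handed to this handler
    # is stored unconditionally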
handle = emit def enable_pretty_logging(logger=logging.getLogger()): channel = logging.StreamHandler() channel.setFormatter(LogFormatter()) logger.addHandler(channel) PKd6[ISV11weblocust/libs/pprint.py# Author: Fred L. Drake, Jr. # fdrake@... # # This is a simple little module I wrote to make life easier. I didn't # see anything quite like it in the library, though I may have overlooked # something. I wrote this when I was trying to read some heavily nested # tuples with fairly non-descriptive content. This is modeled very much # after Lisp/Scheme - style pretty-printing of lists. If you find it # useful, thank small children who sleep at night. """Support to pretty-print lists, tuples, & dictionaries recursively. Very simple, but useful, especially in debugging data structures. Classes ------- PrettyPrinter() Handle pretty-printing operations onto a stream using a configured set of formatting parameters. Functions --------- pformat() Format a Python object into a pretty-printed representation. pprint() Pretty-print a Python object to a stream [default is sys.stdout]. saferepr() Generate a 'standard' repr()-like value, but protect against recursive data structures. """ from __future__ import print_function import six import sys as _sys from io import BytesIO, StringIO __all__ = ["pprint", "pformat", "isreadable", "isrecursive", "saferepr", "PrettyPrinter"] # cache these for faster access: _commajoin = ", ".join _id = id _len = len _type = type def pprint(object, stream=None, indent=1, width=80, depth=None): """Pretty-print a Python object to a stream [default is sys.stdout].""" printer = PrettyPrinter( stream=stream, indent=indent, width=width, depth=depth) printer.pprint(object) def pformat(object, indent=1, width=80, depth=None): """Format a Python object into a pretty-printed representation.""" return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object) def saferepr(object): """Version of repr() which can handle recursive data structures.""" return _safe_repr(object, {}, None, 0)[0] def isreadable(object): """Determine if saferepr(object) is readable by eval().""" return _safe_repr(object, {}, None, 0)[1] def isrecursive(object): """Determine if object requires a recursive representation.""" return _safe_repr(object, {}, None, 0)[2] def _sorted(iterable): return sorted(iterable) class PrettyPrinter: def __init__(self, indent=1, width=80, depth=None, stream=None): """Handle pretty printing operations onto a stream using a set of configured parameters. indent Number of spaces to indent for each level of nesting. width Attempted maximum number of columns in the output. depth The maximum depth to print out nested structures. stream The desired output stream. If omitted (or false), the standard output stream available at construction will be used. 
""" indent = int(indent) width = int(width) assert indent >= 0, "indent must be >= 0" assert depth is None or depth > 0, "depth must be > 0" assert width, "width must be != 0" self._depth = depth self._indent_per_level = indent self._width = width if stream is not None: self._stream = stream else: self._stream = _sys.stdout def pprint(self, object): self._format(object, self._stream, 0, 0, {}, 0) self._stream.write("\n") def pformat(self, object): sio = BytesIO() self._format(object, sio, 0, 0, {}, 0) return sio.getvalue() def isrecursive(self, object): return self.format(object, {}, 0, 0)[2] def isreadable(self, object): s, readable, recursive = self.format(object, {}, 0, 0) return readable and not recursive def _format(self, object, stream, indent, allowance, context, level): level = level + 1 objid = _id(object) if objid in context: stream.write(_recursion(object)) self._recursive = True self._readable = False return rep = self._repr(object, context, level - 1) typ = _type(object) sepLines = _len(rep) > (self._width - 1 - indent - allowance) write = stream.write if self._depth and level > self._depth: write(rep) return r = getattr(typ, "__repr__", None) if issubclass(typ, dict) and r is dict.__repr__: write('{') if self._indent_per_level > 1: write((self._indent_per_level - 1) * ' ') length = _len(object) if length: context[objid] = 1 indent = indent + self._indent_per_level items = _sorted(object.items()) key, ent = items[0] rep = self._repr(key, context, level) write(rep) write(': ') self._format(ent, stream, indent + _len(rep) + 2, allowance + 1, context, level) if length > 1: for key, ent in items[1:]: rep = self._repr(key, context, level) if sepLines: write(',\n%s%s: ' % (' ' * indent, rep)) else: write(', %s: ' % rep) self._format(ent, stream, indent + _len(rep) + 2, allowance + 1, context, level) indent = indent - self._indent_per_level del context[objid] write('}') return if ( (issubclass(typ, list) and r is list.__repr__) or (issubclass(typ, tuple) and r is tuple.__repr__) or (issubclass(typ, set) and r is set.__repr__) or (issubclass(typ, frozenset) and r is frozenset.__repr__) ): length = _len(object) if issubclass(typ, list): write('[') endchar = ']' elif issubclass(typ, set): if not length: write('set()') return write('set([') endchar = '])' object = _sorted(object) indent += 4 elif issubclass(typ, frozenset): if not length: write('frozenset()') return write('frozenset([') endchar = '])' object = _sorted(object) indent += 10 else: write('(') endchar = ')' if self._indent_per_level > 1 and sepLines: write((self._indent_per_level - 1) * ' ') if length: context[objid] = 1 indent = indent + self._indent_per_level self._format(object[0], stream, indent, allowance + 1, context, level) if length > 1: for ent in object[1:]: if sepLines: write(',\n' + ' ' * indent) else: write(', ') self._format(ent, stream, indent, allowance + 1, context, level) indent = indent - self._indent_per_level del context[objid] if issubclass(typ, tuple) and length == 1: write(',') write(endchar) return write(rep) def _repr(self, object, context, level): repr, readable, recursive = self.format(object, context.copy(), self._depth, level) if not readable: self._readable = False if recursive: self._recursive = True return repr def format(self, object, context, maxlevels, level): """Format object for a specific context, returning a string and flags indicating whether the representation is 'readable' and whether the object represents a recursive construct. 
""" return _safe_repr(object, context, maxlevels, level) # Return triple (repr_string, isreadable, isrecursive). def _safe_repr(object, context, maxlevels, level): typ = _type(object) if typ is str: string = object string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t') if 'locale' not in _sys.modules: return repr(object), True, False if "'" in object and '"' not in object: closure = '"' quotes = {'"': '\\"'} string = string.replace('"', '\\"') else: closure = "'" quotes = {"'": "\\'"} string = string.replace("'", "\\'") try: string.decode('utf8').encode('gbk', 'replace') return ("%s%s%s" % (closure, string, closure)), True, False except: pass qget = quotes.get sio = StringIO() write = sio.write for char in object: if char.isalpha(): write(char) else: write(qget(char, repr(char)[1:-1])) return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False if typ is six.text_type: string = object.encode("utf8", 'replace') string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t') if "'" in object and '"' not in object: closure = '"' quotes = {'"': '\\"'} string = string.replace('"', '\\"') else: closure = "'" quotes = {"'": "\\'"} string = string.replace("'", "\\'") return ("u%s%s%s" % (closure, string, closure)), True, False r = getattr(typ, "__repr__", None) if issubclass(typ, dict) and r is dict.__repr__: if not object: return "{}", True, False objid = _id(object) if maxlevels and level >= maxlevels: return "{...}", False, objid in context if objid in context: return _recursion(object), False, True context[objid] = 1 readable = True recursive = False components = [] append = components.append level += 1 saferepr = _safe_repr for k, v in _sorted(object.items()): krepr, kreadable, krecur = saferepr(k, context, maxlevels, level) vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level) append("%s: %s" % (krepr, vrepr)) readable = readable and kreadable and vreadable if krecur or vrecur: recursive = True del context[objid] return "{%s}" % _commajoin(components), readable, recursive if (issubclass(typ, list) and r is list.__repr__) or \ (issubclass(typ, tuple) and r is tuple.__repr__): if issubclass(typ, list): if not object: return "[]", True, False format = "[%s]" elif _len(object) == 1: format = "(%s,)" else: if not object: return "()", True, False format = "(%s)" objid = _id(object) if maxlevels and level >= maxlevels: return format % "...", False, objid in context if objid in context: return _recursion(object), False, True context[objid] = 1 readable = True recursive = False components = [] append = components.append level += 1 for o in object: orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level) append(orepr) if not oreadable: readable = False if orecur: recursive = True del context[objid] return format % _commajoin(components), readable, recursive rep = repr(object) return rep, (rep and not rep.startswith('<')), False def _recursion(object): return ("" % (_type(object).__name__, _id(object))) def _perfcheck(object=None): import time if object is None: object = [("string", (1, 2), [3, 4], {5: 6, 7: 8})] * 100000 p = PrettyPrinter() t1 = time.time() _safe_repr(object, {}, None, 0) t2 = time.time() p.pformat(object) t3 = time.time() print("_safe_repr:", t2 - t1) print("pformat:", t3 - t2) if __name__ == "__main__": _perfcheck() PKd6[I/: weblocust/libs/bench.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # 
http://www.getqiu.com # # Created on 2014-12-08 22:23:10 # Modified on 2016-10-26 20:46:20 import time import logging logger = logging.getLogger('bench') from six.moves import queue as Queue from weblocust.scheduler import ThreadBaseScheduler as Scheduler from weblocust.fetcher.tornado_fetcher import Fetcher from weblocust.processor import Processor from weblocust.result import ResultWorker from weblocust.libs.utils import md5string def bench_test_taskdb(taskdb): project_name = '__bench_test__' task = { "fetch": { "fetch_type": "js", "headers": { "User-Agent": "BaiDuSpider" } }, "process": { "callback": "detail_page" }, "project": project_name, "taskid": "553300d2582154413b4982c00c34a2d5", "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704" } track = { "fetch": { "content": None, "encoding": "unicode", "error": None, "headers": { "last-modified": "Wed, 04 Mar 2015 09:24:33 GMT" }, "ok": True, "redirect_url": None, "status_code": 200, "time": 5.543 }, "process": { "exception": None, "follows": 4, "logs": "", "ok": True, "result": "{'url': u'", "time": 0.07105398178100586 } } def test_insert(n, start=0): logger.info("taskdb insert %d", n) start_time = time.time() for i in range(n): task['url'] = 'http://bench.weblocust.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = {} taskdb.insert(task['project'], task['taskid'], task) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) def test_update(n, start=0): logger.info("taskdb update %d" % n) start_time = time.time() for i in range(n): task['url'] = 'http://bench.weblocust.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = track taskdb.update(task['project'], task['taskid'], task) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) request_task_fields = [ 'taskid', 'project', 'url', 'status', 'fetch', 'process', 'track', 'lastcrawltime' ] def test_get(n, start=0, random=True, fields=request_task_fields): logger.info("taskdb get %d %s" % (n, "randomly" if random else "")) range_n = list(range(n)) if random: from random import shuffle shuffle(range_n) start_time = time.time() for i in range_n: task['url'] = 'http://bench.weblocust.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = track taskdb.get_task(task['project'], task['taskid'], fields=fields) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) try: test_insert(1000) test_update(1000) test_get(1000) test_insert(10000, 1000) test_update(10000, 1000) test_get(10000, 1000) except Exception as e: logger.exception(e) finally: taskdb.drop(project_name) def bench_test_message_queue(queue): task = { "fetch": { "fetch_type": "js", "headers": { "User-Agent": "BaiDuSpider" } }, "process": { "callback": "detail_page" }, "project": "__bench_test__", "taskid": "553300d2582154413b4982c00c34a2d5", "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704" } def test_put(n): logger.info("message queue put %d", n) start_time = time.time() for i in range(n): task['url'] = 'http://bench.weblocust.org/?l=%d' % i task['taskid'] = md5string(task['url']) queue.put(task, block=True, timeout=1) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", 
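
Each `test_*` helper above follows the same pattern: derive a deterministic taskid from the URL with `md5string`, run `n` operations, and log total cost, throughput, and per-op latency. A standalone sketch with a local `md5string` so it runs without weblocust installed (`time.perf_counter` is used instead of `time.time` to avoid a zero cost on coarse clocks):

```python
import hashlib
import time

md5string = lambda x: hashlib.md5(x.encode('utf8')).hexdigest()

def bench(n, op):
    start = time.perf_counter()
    for i in range(n):
        op('http://bench.weblocust.org/?l=%d' % i)
    cost = time.perf_counter() - start
    print('cost %.2fs, %.2f/s %.2fms' % (cost, n / cost, cost / n * 1000))

bench(1000, md5string)
```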
cost_time, n * 1.0 / cost_time, cost_time / n * 1000) def test_get(n): logger.info("message queue get %d", n) start_time = time.time() for i in range(n): try: queue.get(True, 1) except Queue.Empty: logger.error('message queue empty while get %d', i) raise end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) try: test_put(1000) test_get(1000) test_put(10000) test_get(10000) except Exception as e: logger.exception(e) finally: if hasattr(queue, 'channel'): queue.channel.queue_purge(queue.name) class BenchMixin(object): """Report to logger for bench test""" def _bench_init(self): self.done_cnt = 0 self.start_time = time.time() self.last_cnt = 0 self.last_report = 0 def _bench_report(self, name, prefix=0, rjust=0): self.done_cnt += 1 now = time.time() if now - self.last_report >= 1: rps = float(self.done_cnt - self.last_cnt) / (now - self.last_report) output = '' if prefix: output += " " * prefix output += ("%s %s pages (at %d pages/min)" % ( name, self.done_cnt, rps * 60.0)).rjust(rjust) logger.info(output) self.last_cnt = self.done_cnt self.last_report = now class BenchScheduler(Scheduler, BenchMixin): def __init__(self, *args, **kwargs): super(BenchScheduler, self).__init__(*args, **kwargs) self._bench_init() self.trigger_on_start('__bench_test__') def on_task_status(self, task): self._bench_report('Crawled') return super(BenchScheduler, self).on_task_status(task) class BenchFetcher(Fetcher, BenchMixin): def __init__(self, *args, **kwargs): super(BenchFetcher, self).__init__(*args, **kwargs) self._bench_init() def on_result(self, type, task, result): self._bench_report("Fetched", 0, 75) return super(BenchFetcher, self).on_result(type, task, result) class BenchProcessor(Processor, BenchMixin): def __init__(self, *args, **kwargs): super(BenchProcessor, self).__init__(*args, **kwargs) self._bench_init() def on_task(self, task, response): self._bench_report("Processed", 75) return super(BenchProcessor, self).on_task(task, response) class BenchResultWorker(ResultWorker, BenchMixin): def __init__(self, *args, **kwargs): super(BenchResultWorker, self).__init__(*args, **kwargs) self._bench_init() def on_result(self, task, result): self._bench_report("Saved", 0, 150) super(BenchResultWorker, self).on_result(task, result) bench_script = ''' from weblocust.libs.base_handler import * class Handler(BaseHandler): def on_start(self): self.crawl('http://127.0.0.1:5000/bench', params={'total': %(total)d, 'show': %(show)d}, callback=self.index_page) def index_page(self, response): for each in response.doc('a[href^="http://"]').items(): self.crawl(each.attr.href, callback=self.index_page) return response.url ''' PK\ ]I0C0Cweblocust/libs/base_handler.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-16 23:12:48 # Modified on 2016-10-26 20:46:20 import sys import inspect import functools import fractions import six from six import add_metaclass, iteritems from weblocust.libs.url import ( quote_chinese, _build_url, _encode_params, _encode_multipart_formdata, curl_to_arguments) from weblocust.libs.utils import md5string from weblocust.libs.ListIO import ListO from weblocust.libs.response import rebuild_response from weblocust.libs.pprint import pprint from weblocust.processor import ProcessorResult def catch_status_code_error(func): """ Non-200 response will been 
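
`BenchMixin._bench_report` throttles logging to at most one line per second and reports a pages/min rate computed from the delta since the last report. The same pattern in isolation:

```python
import time

class Reporter:
    def __init__(self):
        self.done_cnt = 0
        self.last_cnt = 0
        self.last_report = time.time()

    def report(self, name):
        self.done_cnt += 1
        now = time.time()
        if now - self.last_report >= 1:   # at most one log line per second
            rps = (self.done_cnt - self.last_cnt) / (now - self.last_report)
            print('%s %d pages (at %d pages/min)' % (name, self.done_cnt, rps * 60))
            self.last_cnt, self.last_report = self.done_cnt, now

r = Reporter()
for _ in range(5):
    r.report('Crawled')
    time.sleep(0.3)
```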
regarded as fetch failed and will not pass to callback. Use this decorator to override this feature. """ func._catch_status_code_error = True return func def not_send_status(func): """ Do not send process status package back to scheduler. It's used by callbacks like on_message, on_result etc... """ @functools.wraps(func) def wrapper(self, response, task): self._extinfo['not_send_status'] = True function = func.__get__(self, self.__class__) return self._run_func(function, response, task) return wrapper def config(_config=None, **kwargs): """ A decorator for setting the default kwargs of `BaseHandler.crawl`. Any self.crawl with this callback will use this config. """ if _config is None: _config = {} _config.update(kwargs) def wrapper(func): func._config = _config return func return wrapper class NOTSET(object): pass def every(minutes=NOTSET, seconds=NOTSET): """ method will been called every minutes or seconds """ def wrapper(func): # mark the function with variable 'is_cronjob=True', the function would be # collected into the list Handler._cron_jobs by meta class func.is_cronjob = True # collect interval and unify to seconds, it's used in meta class. See the # comments in meta class. func.tick = minutes * 60 + seconds return func if inspect.isfunction(minutes): func = minutes minutes = 1 seconds = 0 return wrapper(func) if minutes is NOTSET: if seconds is NOTSET: minutes = 1 seconds = 0 else: minutes = 0 if seconds is NOTSET: seconds = 0 return wrapper class BaseHandlerMeta(type): def __new__(cls, name, bases, attrs): # A list of all functions which is marked as 'is_cronjob=True' cron_jobs = [] # The min_tick is the greatest common divisor(GCD) of the interval of cronjobs # this value would be queried by scheduler when the project initial loaded. # Scheudler may only send _on_cronjob task every min_tick seconds. It can reduce # the number of tasks sent from scheduler. min_tick = 0 for each in attrs.values(): if inspect.isfunction(each) and getattr(each, 'is_cronjob', False): cron_jobs.append(each) min_tick = fractions.gcd(min_tick, each.tick) newcls = type.__new__(cls, name, bases, attrs) newcls._cron_jobs = cron_jobs newcls._min_tick = min_tick return newcls @add_metaclass(BaseHandlerMeta) class BaseHandler(object): """ BaseHandler for all scripts. `BaseHandler.run` is the main method to handler the task. """ crawl_config = {} project_name = None _cron_jobs = [] _min_tick = 0 __env__ = {'not_inited': True} retry_delay = {} def _reset(self): """ reset before each task """ self._extinfo = {} self._messages = [] self._follows = [] self._follows_keys = set() def _run_func(self, function, *arguments): """ Running callback function with requested number of arguments 这傻逼,为啥在调用 self._run_func 时,最后一定要多传一个参数进来 comment by qiulimao@2016.06.07 以下来自binux的回答:@2016.11.28 因为 callback 可以 接受多个参数。 如果你不需要 task,那么 def callback(self,response), 如果你需要 task 的时候 def callback(self,response,task), _run_func 保证了无论哪种函数原型,都能获得需要的参数。 """ args, varargs, keywords, defaults = inspect.getargspec(function) # 这个 _run_func是 一个class instance 方法,第一参数默认为self,所以 函数接收参数个数为: args-1 # 最后多传一个参数进来,我真的不理解。 return function(*arguments[:len(args) - 1]) def _run_task(self, task, response): """ Finding callback specified by `task['callback']` raising status error for it if needed. """ process = task.get('process', {}) callback = process.get('callback', '__call__') if not hasattr(self, callback): raise NotImplementedError("self.%s() not implemented!" 
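
`BaseHandlerMeta` collects every `@every`-marked method and stores the GCD of their intervals as `_min_tick`, so the scheduler only needs to fire one `_on_cronjob` trigger per GCD rather than one per job. The computation in isolation (using `math.gcd`; the module itself uses the Python 2 `fractions.gcd`):

```python
from math import gcd

ticks = [24 * 60 * 60, 30 * 60]   # @every(minutes=24 * 60) and @every(minutes=30)
min_tick = 0
for tick in ticks:
    min_tick = gcd(min_tick, tick)
print(min_tick)                    # 1800: one trigger task every 30 minutes
```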
% callback) function = getattr(self, callback) # do not run_func when 304 if response.status_code == 304 and not getattr(function, '_catch_status_code_error', False): return None if not getattr(function, '_catch_status_code_error', False): response.raise_for_status() # callback 也只接收一个参数,这里给了两个参数,所以在程序当中你可以使用两个参数 return self._run_func(function, response, task) def _deal_result(self,result,response,task): """ we alway come in this situation: differrent crawl callback has differrent data schema,we need to save them seperatly. in preview version,all data yield by crawl callback use the same on_result method. But this time: you can save data to differrent schema by offer an `on_result__callback` method in your class just like this: def list_page(self,response): for item in response.xpath("//div[@class="item-list"]"): yield { "url":response.url, "updatetime":item.xpath("./text()"), } self.crawl(a_url,callback=self.next_url) def next_url(self,response): return { "more_detail":response.text, } def on_result__list_page(self,result): save_list_page_result_to_table_one() def on_result__next_url(self,result): save_list_page_result_to_table_two() by default: if you don't offer on_result_(%callmethod)s we fall to use self.on_result powered by qiulimao@2016.06.07 """ process = task.get('process', {}) result_loader = process.get('callback') current_result_dealer = "on_result__%s"%result_loader if result_loader else "on_result" function = getattr(self,current_result_dealer) if hasattr(self, current_result_dealer) else getattr(self,"on_result") # this method is tested,in most cases,it won't raise exceptions, # so let's apply this method first. #self.on_result(result) # 使用这种方式好像有点可以重载的样子 # 如果客户端重写了on_result 方法,那么它可以定决定是否使用result,response,task这三个参数,目前on_result只使用了result参数 self._run_func(self.on_result,result,response,task) if function.__name__ != "on_result": # 不要让on_result方法重复运行. # exceptions will be catch by self.run_task method # 那么同理:on_result__callback 同样可以接收三个参数 self._run_func(function,result,response,task) def run_task(self, module, task, response): """ Processing the task, catching exceptions and logs, return a `ProcessorResult` object """ logger = module.logger result = None exception = None stdout = sys.stdout self.task = task if isinstance(response, dict): response = rebuild_response(response) self.response = response self.save = (task.get('track') or {}).get('save', {}) try: if self.__env__.get('enable_stdout_capture', True): sys.stdout = ListO(module.log_buffer) self._reset() result = self._run_task(task, response) if inspect.isgenerator(result): for r in result: # 为啥 on_result 只接收一个参数,这里给了它三个参数?因为这样重写on_result更灵活,可以接收三个以内的参数 #self._run_func(self.on_result, r, response, task) self._deal_result(r,response,task) else: #self._run_func(self.on_result, result, response, task) self._deal_result(result,response,task) except Exception as e: logger.exception(e) exception = e finally: follows = self._follows messages = self._messages logs = list(module.log_buffer) extinfo = self._extinfo save = self.save sys.stdout = stdout self.task = None self.response = None self.save = None module.log_buffer[:] = [] return ProcessorResult(result, follows, messages, logs, exception, extinfo, save) def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} assert len(url) < 1024, "Maximum (1024) URL length error." 
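
The dispatch in `_deal_result` reduces to a `getattr` lookup: run the generic `on_result` first, then the per-callback `on_result__<name>` sink if the handler defines one. A trimmed sketch of that routing:

```python
class Handler:
    def on_result(self, result):
        print('default sink:', result)

    def on_result__list_page(self, result):
        print('list_page sink:', result)

def deal_result(handler, result, callback):
    name = 'on_result__%s' % callback if callback else 'on_result'
    sink = getattr(handler, name, handler.on_result)
    handler.on_result(result)           # the generic sink always runs
    if sink.__name__ != 'on_result':    # avoid running the default twice
        sink(result)

deal_result(Handler(), {'url': 'http://example.com'}, 'list_page')
```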
if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}), kwargs.pop('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl'): if key in kwargs: schedule[key] = kwargs.pop(key) task['schedule'] = schedule fetch = {} for key in ( 'method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'last_modified', 'save', 'js_run_at', 'js_script', 'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert', 'max_redirects', 'robots_txt' ): if key in kwargs: fetch[key] = kwargs.pop(key) task['fetch'] = fetch process = {} for key in ('callback', ): if key in kwargs: process[key] = kwargs.pop(key) task['process'] = process task['project'] = self.project_name task['url'] = url if 'taskid' in kwargs: task['taskid'] = kwargs.pop('taskid') else: task['taskid'] = self.get_taskid(task) if kwargs: raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys()) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task def get_taskid(self, task): '''Generate taskid by information of task md5(url) by default, override me''' return md5string(task['url']) # apis def crawl(self, url, **kwargs): ''' available params: url callback method params data files headers timeout allow_redirects cookies proxy etag last_modified auto_recrawl fetch_type js_run_at js_script js_viewport_width js_viewport_height load_images priority retries exetime age itag save taskid full documents: http://weblocust.readthedocs.org/en/latest/apis/self.crawl/ ''' if isinstance(url, six.string_types) and url.startswith('curl '): curl_kwargs = curl_to_arguments(url) url = curl_kwargs.pop('urls') for k, v in iteritems(curl_kwargs): kwargs.setdefault(k, v) if isinstance(url, six.string_types): return self._crawl(url, **kwargs) elif hasattr(url, "__iter__"): result = [] for each in url: result.append(self._crawl(each, **kwargs)) return result def is_debugger(self): """Return true if running in debugger""" return self.__env__.get('debugger') def send_message(self, project, msg, url='data:,on_message'): """Send messages to other project.""" self._messages.append((project, msg, url)) def on_message(self, project, msg): """Receive message from other project, override me.""" pass def on_result(self, 
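
`_crawl` is mostly key routing: every recognised kwarg is popped into the `schedule`, `fetch`, or `process` sub-dict, and anything left over raises `TypeError`. A reduced sketch with a subset of the key lists shown above:

```python
SCHEDULE_KEYS = ('priority', 'retries', 'exetime', 'age', 'itag')
FETCH_KEYS = ('method', 'headers', 'data', 'timeout', 'cookies', 'proxy')

def repack(url, **kwargs):
    task = {'url': url, 'schedule': {}, 'fetch': {}, 'process': {}}
    for key in list(kwargs):
        if key in SCHEDULE_KEYS:
            task['schedule'][key] = kwargs.pop(key)
        elif key in FETCH_KEYS:
            task['fetch'][key] = kwargs.pop(key)
        elif key == 'callback':
            task['process'][key] = kwargs.pop(key)
    if kwargs:
        raise TypeError('crawl() got unexpected keyword argument: %s' % list(kwargs))
    return task

print(repack('http://example.com', priority=2, callback='detail_page'))
```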
result): """Receiving returns from other callback, override me.""" if not result: return assert self.task, "on_result can't outside a callback." if self.is_debugger(): pprint(result) if self.__env__.get('result_queue'): self.__env__['result_queue'].put((self.task, result)) @not_send_status def _on_message(self, response): project, msg = response.save return self.on_message(project, msg) @not_send_status def _on_cronjob(self, response, task): if (not response.save or not isinstance(response.save, dict) or 'tick' not in response.save): return # When triggered, a '_on_cronjob' task is sent from scheudler with 'tick' in # Response.save. Scheduler may at least send the trigger task every GCD of the # inverval of the cronjobs. The method should check the tick for each cronjob # function to confirm the execute interval. for cronjob in self._cron_jobs: if response.save['tick'] % cronjob.tick != 0: continue function = cronjob.__get__(self, self.__class__) self._run_func(function, response, task) def _on_get_info(self, response, task): """Sending runtime infomation about this script.""" for each in response.save or []: if each == 'min_tick': self.save[each] = self._min_tick elif each == 'retry_delay': if not isinstance(self.retry_delay, dict): self.retry_delay = {'': self.retry_delay} self.save[each] = self.retry_delay @not_send_status def on_finished(self, response, task): pass PKd6[I* y weblocust/libs/sample_handler.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # Created on: __DATE__ # Project: __PROJECT_NAME__ # # .-"``"-. # /______; \ # {_______}\| # (/ a a \)(_) # (.-.).-.) # ____________ooo__( ^ )________________ # / '-.___.-' \ #| Hello Chebyshev: | #| Are you writting BUUUUGs again? | #| please make good explaination Notes. | #| | #| --May you have fun with this framework. | #| tips from qiulimao@2016.06.07 | # \_____________________________ooo____________/ # |_ | _| jgs # \___|___/ # {___|___} # |_ | _| # /-'Y'-\ # (__/ \__) # from weblocust.libs.base_handler import * from weblocust.libs.useragent import IphoneSafari,LinuxChrome from weblocust.libs.cleaners import TakeFirst,JoinCleaner,StripBlankMoreThan2 from weblocust.libs.cleaners import reduceclean,mapclean,mapreduce class Handler(BaseHandler): crawl_config = { 'headers': {'User-Agent': LinuxChrome} } @every(minutes=24 * 60) def on_start(self): self.crawl('__START_URL__', callback=self.index_page) @config(age=60) def index_page(self, response): """ response.xpath method is available ```python for url in response.xpath("//a/@href"): self.crawl(url,callback=self.detail_page) ``` this does the same effect as below: """ for each in response.doc('a[href^="http"]').items(): self.crawl(each.attr.href, callback=self.detail_page) @config(priority=2) def detail_page(self, response): return { "url": response.url, "title": response.doc('title').text(), } def on_result__detail_page(self,result): """ you can do some jobs to persist datas to you database, if a callback returned or yielded value,`on_result__callback` method will be called,otherwise, this kind of function are garbage. However,items are saved to mongodb by default,you can read datas from mongodb,in other word,you don't need to offer this method. Reading datas from mongodb is the recomanded way. 
""" pass PKd6[Iw7171weblocust/libs/counter.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-14 17:09:50 # Modified on 2016-10-26 20:46:20 from __future__ import unicode_literals, division, absolute_import import time import logging from collections import deque try: from UserDict import DictMixin except ImportError: from collections import Mapping as DictMixin import six from six import iteritems from six.moves import cPickle class BaseCounter(object): def __init__(self): raise NotImplementedError def event(self, value=1): """Fire a event.""" raise NotImplementedError def value(self, value): """Set counter value.""" raise NotImplementedError @property def avg(self): """Get average value""" raise NotImplementedError @property def sum(self): """Get sum of counter""" raise NotImplementedError def empty(self): """Clear counter""" raise NotImplementedError class TotalCounter(BaseCounter): """Total counter""" def __init__(self): self.cnt = 0 def event(self, value=1): self.cnt += value def value(self, value): self.cnt = value @property def avg(self): return self.cnt @property def sum(self): return self.cnt def empty(self): return self.cnt == 0 class AverageWindowCounter(BaseCounter): """ Record last N(window) value """ def __init__(self, window_size=300): self.window_size = window_size self.values = deque(maxlen=window_size) def event(self, value=1): self.values.append(value) value = event @property def avg(self): return self.sum / len(self.values) @property def sum(self): return sum(self.values) def empty(self): if not self.values: return True class TimebaseAverageEventCounter(BaseCounter): """ Record last window_size * window_interval seconds event. 
records will trim ever window_interval seconds """ def __init__(self, window_size=30, window_interval=10): self.max_window_size = window_size self.window_size = 0 self.window_interval = window_interval self.values = deque(maxlen=window_size) self.events = deque(maxlen=window_size) self.times = deque(maxlen=window_size) self.cache_value = 0 self.cache_event = 0 self.cache_start = None self._first_data_time = None def event(self, value=1): now = time.time() if self._first_data_time is None: self._first_data_time = now if self.cache_start is None: self.cache_value = value self.cache_event = 1 self.cache_start = now elif now - self.cache_start > self.window_interval: self.values.append(self.cache_value) self.events.append(self.cache_event) self.times.append(self.cache_start) self.on_append(self.cache_value, self.cache_start) self.cache_value = value self.cache_event = 1 self.cache_start = now else: self.cache_value += value self.cache_event += 1 return self def value(self, value): self.cache_value = value def _trim_window(self): now = time.time() if self.cache_start and now - self.cache_start > self.window_interval: self.values.append(self.cache_value) self.events.append(self.cache_event) self.times.append(self.cache_start) self.on_append(self.cache_value, self.cache_start) self.cache_value = 0 self.cache_start = None if self.window_size != self.max_window_size and self._first_data_time is not None: time_passed = now - self._first_data_time self.window_size = min(self.max_window_size, time_passed / self.window_interval) window_limit = now - self.window_size * self.window_interval while self.times and self.times[0] < window_limit: self.times.popleft() self.events.popleft() self.values.popleft() @property def avg(self): events = (sum(self.events) + self.cache_event) if not events: return 0 return float(self.sum) / events @property def sum(self): self._trim_window() return sum(self.values) + self.cache_value def empty(self): self._trim_window() if not self.values and not self.cache_start: return True def on_append(self, value, time): pass class TimebaseAverageWindowCounter(BaseCounter): """ Record last window_size * window_interval seconds values. 
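
Both `Timebase*` counters share the same bucketing: events inside one `window_interval` accumulate in a cache slot, which is flushed into the deques when the interval rolls over (or lazily by `_trim_window`). The core of `event` in isolation:

```python
import time
from collections import deque

window_interval = 10
values, times = deque(maxlen=30), deque(maxlen=30)
cache_value, cache_start = 0, None

def event(value=1):
    global cache_value, cache_start
    now = time.time()
    if cache_start is None:                     # first event of a bucket
        cache_value, cache_start = value, now
    elif now - cache_start > window_interval:   # bucket expired: flush it
        values.append(cache_value)
        times.append(cache_start)
        cache_value, cache_start = value, now
    else:                                       # still inside the same bucket
        cache_value += value
```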
records will trim ever window_interval seconds """ def __init__(self, window_size=30, window_interval=10): self.max_window_size = window_size self.window_size = 0 self.window_interval = window_interval self.values = deque(maxlen=window_size) self.times = deque(maxlen=window_size) self.cache_value = 0 self.cache_start = None self._first_data_time = None def event(self, value=1): now = time.time() if self._first_data_time is None: self._first_data_time = now if self.cache_start is None: self.cache_value = value self.cache_start = now elif now - self.cache_start > self.window_interval: self.values.append(self.cache_value) self.times.append(self.cache_start) self.on_append(self.cache_value, self.cache_start) self.cache_value = value self.cache_start = now else: self.cache_value += value return self def value(self, value): self.cache_value = value def _trim_window(self): now = time.time() if self.cache_start and now - self.cache_start > self.window_interval: self.values.append(self.cache_value) self.times.append(self.cache_start) self.on_append(self.cache_value, self.cache_start) self.cache_value = 0 self.cache_start = None if self.window_size != self.max_window_size and self._first_data_time is not None: time_passed = now - self._first_data_time self.window_size = min(self.max_window_size, time_passed / self.window_interval) window_limit = now - self.window_size * self.window_interval while self.times and self.times[0] < window_limit: self.times.popleft() self.values.popleft() @property def avg(self): sum = float(self.sum) if not self.window_size: return 0 return sum / self.window_size / self.window_interval @property def sum(self): self._trim_window() return sum(self.values) + self.cache_value def empty(self): self._trim_window() if not self.values and not self.cache_start: return True def on_append(self, value, time): pass class CounterValue(DictMixin): """ A dict like value item for CounterManager. """ def __init__(self, manager, keys): self.manager = manager self._keys = keys def __getitem__(self, key): if key == '__value__': key = self._keys return self.manager.counters[key] else: key = self._keys + (key, ) available_keys = [] for _key in self.manager.counters.keys(): if _key[:len(key)] == key: available_keys.append(_key) if len(available_keys) == 0: raise KeyError elif len(available_keys) == 1: if available_keys[0] == key: return self.manager.counters[key] else: return CounterValue(self.manager, key) else: return CounterValue(self.manager, key) def __len__(self): return len(self.keys()) def __iter__(self): return iter(self.keys()) def __contains__(self, key): return key in self.keys() def keys(self): result = set() for key in self.manager.counters.keys(): if key[:len(self._keys)] == self._keys: key = key[len(self._keys):] result.add(key[0] if key else '__value__') return result def to_dict(self, get_value=None): """Dump counters as a dict""" result = {} for key, value in iteritems(self): if isinstance(value, BaseCounter): if get_value is not None: value = getattr(value, get_value) result[key] = value else: result[key] = value.to_dict(get_value) return result class CounterManager(DictMixin): """ A dict like counter manager. When using a tuple as event key, say: ('foo', 'bar'), You can visite counter with manager['foo']['bar']. Or get all counters which first element is 'foo' by manager['foo']. It's useful for a group of counters. 
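
Note that `TimebaseAverageWindowCounter.avg` is a per-second rate, not a per-event mean: the window total is divided by the window span in seconds.

```python
window_size, window_interval = 30, 10   # 30 buckets of 10 seconds each
window_total = 600                      # sum(values) + cache_value
print(window_total / window_size / window_interval)   # 2.0 events per second
```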
""" def __init__(self, cls=TimebaseAverageWindowCounter): """init manager with Counter cls""" self.cls = cls self.counters = {} def event(self, key, value=1): """Fire a event of a counter by counter key""" if isinstance(key, six.string_types): key = (key, ) assert isinstance(key, tuple), "event key type error" if key not in self.counters: self.counters[key] = self.cls() self.counters[key].event(value) return self def value(self, key, value=1): """Set value of a counter by counter key""" if isinstance(key, six.string_types): key = (key, ) assert isinstance(key, tuple), "event key type error" if key not in self.counters: self.counters[key] = self.cls() self.counters[key].value(value) return self def trim(self): """Clear not used counters""" for key, value in list(iteritems(self.counters)): if value.empty(): del self.counters[key] def __getitem__(self, key): key = (key, ) available_keys = [] for _key in self.counters.keys(): if _key[:len(key)] == key: available_keys.append(_key) if len(available_keys) == 0: raise KeyError elif len(available_keys) == 1: if available_keys[0] == key: return self.counters[key] else: return CounterValue(self, key) else: return CounterValue(self, key) def __delitem__(self, key): key = (key, ) available_keys = [] for _key in self.counters.keys(): if _key[:len(key)] == key: available_keys.append(_key) for _key in available_keys: del self.counters[_key] def __iter__(self): return iter(self.keys()) def __len__(self): return len(self.keys()) def keys(self): result = set() for key in self.counters.keys(): result.add(key[0] if key else ()) return result def to_dict(self, get_value=None): """Dump counters as a dict""" self.trim() result = {} for key, value in iteritems(self): if isinstance(value, BaseCounter): if get_value is not None: value = getattr(value, get_value) result[key] = value else: result[key] = value.to_dict(get_value) return result def dump(self, filename): """Dump counters to file""" try: with open(filename, 'wb') as fp: cPickle.dump(self.counters, fp) except: logging.error("can't dump counter to file: %s" % filename) return False return True def load(self, filename): """Load counters to file""" try: with open(filename) as fp: self.counters = cPickle.load(fp) except: logging.debug("can't load counter from file: %s" % filename) return False return True PKd6[I'YYweblocust/libs/url.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-09 14:39:57 # Modified on 2016-10-26 20:46:20 import mimetypes import six import shlex from six.moves.urllib.parse import urlparse, urlunparse from requests.models import RequestEncodingMixin def get_content_type(filename): """Guessing file type by filename""" return mimetypes.guess_type(filename)[0] or 'application/octet-stream' _encode_params = RequestEncodingMixin._encode_params def _encode_multipart_formdata(fields, files): body, content_type = RequestEncodingMixin._encode_files(files, fields) return content_type, body def _build_url(url, _params): """Build the actual URL to use.""" # Support for unicode domain names and paths. 
scheme, netloc, path, params, query, fragment = urlparse(url) netloc = netloc.encode('idna').decode('utf-8') if not path: path = '/' if six.PY2: if isinstance(scheme, six.text_type): scheme = scheme.encode('utf-8') if isinstance(netloc, six.text_type): netloc = netloc.encode('utf-8') if isinstance(path, six.text_type): path = path.encode('utf-8') if isinstance(params, six.text_type): params = params.encode('utf-8') if isinstance(query, six.text_type): query = query.encode('utf-8') if isinstance(fragment, six.text_type): fragment = fragment.encode('utf-8') enc_params = _encode_params(_params) if enc_params: if query: query = '%s&%s' % (query, enc_params) else: query = enc_params url = (urlunparse([scheme, netloc, path, params, query, fragment])) return url def quote_chinese(url, encodeing="utf-8"): """Quote non-ascii characters""" if isinstance(url, six.text_type): return quote_chinese(url.encode(encodeing)) if six.PY3: res = [six.int2byte(b).decode('latin-1') if b < 128 else '%%%02X' % b for b in url] else: res = [b if ord(b) < 128 else '%%%02X' % ord(b) for b in url] return "".join(res) def curl_to_arguments(curl): kwargs = {} headers = {} command = None urls = [] current_opt = None for part in shlex.split(curl): if command is None: # curl command = part elif not part.startswith('-') and not current_opt: # waiting for url urls.append(part) elif current_opt is None and part.startswith('-'): # flags if part == '--compressed': kwargs['use_gzip'] = True else: current_opt = part else: # option if current_opt is None: raise TypeError('Unknow curl argument: %s' % part) elif current_opt in ('-H', '--header'): key_value = part.split(':', 1) if len(key_value) == 2: key, value = key_value headers[key.strip()] = value.strip() elif current_opt in ('-d', '--data'): kwargs['data'] = part elif current_opt in ('--data-binary'): if part[0] == '$': part = part[1:] kwargs['data'] = part elif current_opt in ('-X', '--request'): kwargs['method'] = part else: raise TypeError('Unknow curl option: %s' % current_opt) current_opt = None if not urls: raise TypeError('curl: no URL specified!') if current_opt: raise TypeError('Unknow curl option: %s' % current_opt) kwargs['urls'] = urls if headers: kwargs['headers'] = headers return kwargs PKd6[I: weblocust/libs/cleaners.py#coding:utf-8 import re from .response import Response from collections import Iterable #__all__ = ['JoinCleaner','StripBlankMoreThan2','reduceclean','mapclean','mapreduce',"TakeFirst"] class BaseCleaner(object): """ basic cleaner """ @classmethod def doapply(cls,raw_input): raise NotImplementedError class JoinCleaner(BaseCleaner): """ join list to one item """ seperator = "" @classmethod def doapply(cls,raw_input): if not isinstance(raw_input,list): if isinstance(raw_input,unicode): return raw_input elif isinstance(raw_input,str): return raw_input.decode("utf-8") return cls.seperator.join(raw_input) @classmethod def set_seperator(cls,seperator): cls.seperator = seperator class StripBlankMoreThan2(BaseCleaner): """ strip black more than 2 """ @classmethod def doapply(cls,raw_input): decoded_input = raw_input if isinstance(raw_input,str) else raw_input.encode("utf-8") final_result = re.sub(r'\s{2,}'," ",decoded_input) return final_result.decode("utf-8") class TakeFirst(BaseCleaner): """ return the fist of the list """ @classmethod def doapply(cls,raw_input): if raw_input and isinstance(raw_input,Iterable): return raw_input[0] else: return raw_input class DoNothingCleaner(BaseCleaner): """ """ @classmethod def doapply(cls,raw_input): return raw_input 
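
Stripped of the Python 2/3 text juggling, `_build_url` parses the URL, merges the encoded params into whatever query string is already there, and reassembles. The core, runnable with the same `six.moves` imports the module uses:

```python
from six.moves.urllib.parse import urlparse, urlunparse, urlencode

url = 'http://example.com/search?a=1'
scheme, netloc, path, params, query, fragment = urlparse(url)
enc_params = urlencode({'q': 'spider'})
query = '%s&%s' % (query, enc_params) if query else enc_params
print(urlunparse([scheme, netloc, path, params, query, fragment]))
# http://example.com/search?a=1&q=spider
```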
def reduceclean(response,xpath_selector,*cleaners): """ apply a list of cleaner on the item """ raw_result = Response.extract(response.xpath(xpath_selector)) final_result = reduce(lambda r,c:c.doapply(r),cleaners,raw_result) return final_result def mapclean(response,xpath_selector,*cleaners): """ apply a list of cleaner on every element of the list item """ raw_result = Response.extract(response.xpath(xpath_selector)) final_result = reduce(lambda r,c:map(lambda _r:c.doapply(_r),r),cleaners,raw_result) return final_result def mapreduce(response,xpath_selector,map_cleaners,reduce_cleaners): """ apply a list map_cleaners on very element of the list item, then apply a list of reduce_cleaner on the result of map_cleaners """ raw_result = Response.extract(response.xpath(xpath_selector)) maped_result = mapclean(response,xpath_selector,*map_cleaners) final_result = reduce(lambda r,c:c.doapply(r),reduce_cleaners,maped_result) return final_resultPKd6[I3 weblocust/libs/response.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-02 11:16:02 # Modified on 2016-10-26 20:46:20 import six import json import chardet import lxml.html import lxml.etree from pyquery import PyQuery from requests.structures import CaseInsensitiveDict from requests.utils import get_encoding_from_headers try: from requests.utils import get_encodings_from_content except ImportError: get_encodings_from_content = None from requests import HTTPError from weblocust.libs import utils from six.moves.urllib.parse import urljoin class Response(object): def __init__(self): self.status_code = None self.url = None self.orig_url = None self.headers = CaseInsensitiveDict() self.content = '' self.cookies = {} self.error = None self.save = None self.js_script_result = None self.time = 0 def __repr__(self): return u'' % self.status_code def __bool__(self): """Returns true if `status_code` is 200 and no error""" return self.ok def __nonzero__(self): """Returns true if `status_code` is 200 and no error.""" return self.ok @property def ok(self): """Return true if `status_code` is 200 and no error.""" try: self.raise_for_status() except: return False return True @property def encoding(self): """ encoding of Response.content. if Response.encoding is None, encoding will be guessed by header or content or chardet if available. """ if hasattr(self, '_encoding'): return self._encoding # content is unicode if isinstance(self.content, six.text_type): return 'unicode' # Try charset from content-type encoding = get_encoding_from_headers(self.headers) if encoding == 'ISO-8859-1': encoding = None # Try charset from content if not encoding and get_encodings_from_content: if six.PY3: encoding = get_encodings_from_content(utils.pretty_unicode(self.content[:100])) else: encoding = get_encodings_from_content(self.content) encoding = encoding and encoding[0] or None # Fallback to auto-detected encoding. if not encoding and chardet is not None: encoding = chardet.detect(self.content)['encoding'] if encoding and encoding.lower() == 'gb2312': encoding = 'gb18030' self._encoding = encoding or 'utf-8' return self._encoding @encoding.setter def encoding(self, value): """ set encoding of content manually it will overwrite the guessed encoding """ self._encoding = value self._text = None @property def text(self): """ Content of the response, in unicode. 
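
The cleaner helpers are a plain left fold over `doapply` callables. Be aware they lean on the Python 2 builtins `reduce` and `unicode`; under Python 3 the fold needs `functools.reduce`, as in this sketch with plain callables standing in for the cleaner classes:

```python
from functools import reduce

cleaners = [str.strip, str.lower]      # stand-ins for *Cleaner.doapply
raw = '  Hello   World  '
print(reduce(lambda r, c: c(r), cleaners, raw))   # 'hello   world'
```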
if Response.encoding is None and chardet module is available, encoding will be guessed. """ if hasattr(self, '_text') and self._text: return self._text if not self.content: return u'' if isinstance(self.content, six.text_type): return self.content content = None encoding = self.encoding # Decode unicode from given encoding. try: content = self.content.decode(encoding, 'replace') except LookupError: # A LookupError is raised if the encoding was not found which could # indicate a misspelling or similar mistake. # # So we try blindly encoding. content = self.content.decode('utf-8', 'replace') self._text = content return content @property def json(self): """Returns the json-encoded content of the response, if any.""" if hasattr(self, '_json'): return self._json try: self._json = json.loads(self.text or self.content) except ValueError: self._json = None return self._json @property def doc(self): """Returns a PyQuery object of the response's content""" if hasattr(self, '_doc'): return self._doc elements = self.etree doc = self._doc = PyQuery(elements) doc.make_links_absolute(utils.text(self.url)) return doc @property def etree(self): """Returns a lxml object of the response's content that can be selected by xpath""" if not hasattr(self, '_elements'): try: parser = lxml.html.HTMLParser(encoding=self.encoding) self._elements = lxml.html.fromstring(self.content, parser=parser) except LookupError: # lxml would raise LookupError when encoding not supported # try fromstring without encoding instead. # on windows, unicode is not availabe as encoding for lxml self._elements = lxml.html.fromstring(self.content) if isinstance(self._elements, lxml.etree._ElementTree): self._elements = self._elements.getroot() return self._elements ## # the below is to make weblocust's response can be used as scrapy ## @property def xpath(self): """ shortcut for etree.xpath add by qiulimao@2016.05""" return self.etree.xpath @staticmethod def extract(items): """ extract xpath items(lxml.html.HtmlElement) to unicode if items is a lxml.html.HtmlElement return string if items is a lxml.html.HtmlElement list return [string1,string2] if items is a list but not lxml.html.HtmlElement items list ,return [*items] """ def htmlelement2string(item): if isinstance(item,lxml.html.HtmlElement): return lxml.etree.tostring(item).decode("utf-8") else: return item if isinstance(items,list): #if len(items) == 1: # return htmlelement2string(items[0]) #else: return map(htmlelement2string,items) else: return htmlelement2string(items) def body_as_unicode(self): """ add this method to be campatible with scrapy """ return self.text def urljoin(self, url): """Join this Response's url with a possible relative url to form an absolute interpretation of the latter.""" return urljoin(self.url, url) ## # the upper is to make weblocust's response can be used as scrapy ## def raise_for_status(self, allow_redirects=True): """Raises stored :class:`HTTPError` or :class:`URLError`, if one occurred.""" if self.status_code == 304: return elif self.error: http_error = HTTPError(self.error) elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects: http_error = HTTPError('%s Redirection' % (self.status_code)) elif (self.status_code >= 400) and (self.status_code < 500): http_error = HTTPError('%s Client Error' % (self.status_code)) elif (self.status_code >= 500) and (self.status_code < 600): http_error = HTTPError('%s Server Error' % (self.status_code)) else: return http_error.response = self raise http_error def isok(self): try: 
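
The scrapy-style shortcuts bottom out in plain lxml calls: `response.xpath(...)` is `etree.xpath`, and `Response.extract` serialises any `HtmlElement` results back to markup while letting strings pass through.

```python
import lxml.etree
import lxml.html

doc = lxml.html.fromstring('<div><a href="http://example.com">hi</a></div>')
print(doc.xpath('//a/@href'))          # ['http://example.com'] (strings pass through)
print([lxml.etree.tostring(e).decode('utf-8') for e in doc.xpath('//a')])
# ['<a href="http://example.com">hi</a>']
```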
self.raise_for_status() return True except: return False def rebuild_response(r): """ 把raw response 转化为weblocust当中的response """ response = Response() response.status_code = r.get('status_code', 599) response.url = r.get('url', '') response.headers = CaseInsensitiveDict(r.get('headers', {})) response.content = r.get('content', '') response.cookies = r.get('cookies', {}) response.error = r.get('error') response.time = r.get('time', 0) response.orig_url = r.get('orig_url', response.url) response.js_script_result = r.get('js_script_result') response.save = r.get('save') return response PKd6[Iweblocust/libs/__init__.pyPKd6[Iį0FFweblocust/libs/ListIO.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-26 23:41:51 # Modified on 2016-10-26 20:46:20 class ListO(object): """A StringO write to list.""" def __init__(self, buffer=None): self._buffer = buffer if self._buffer is None: self._buffer = [] def isatty(self): return False def close(self): pass def flush(self): pass def seek(self, n, mode=0): pass def readline(self): pass def reset(self): pass def write(self, x): self._buffer.append(x) def writelines(self, x): self._buffer.extend(x) PKd6[I 'weblocust/libs/multiprocessing_queue.pyimport six import platform import multiprocessing from multiprocessing.queues import Queue as BaseQueue # The SharedCounter and Queue classes come from: # https://github.com/vterron/lemon/commit/9ca6b4b class SharedCounter(object): """ A synchronized shared counter. The locking done by multiprocessing.Value ensures that only a single process or thread may read or write the in-memory ctypes object. However, in order to do n += 1, Python performs a read followed by a write, so a second process may read the old value before the new one is written by the first process. The solution is to use a multiprocessing.Lock to guarantee the atomicity of the modifications to Value. This class comes almost entirely from Eli Bendersky's blog: http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/ """ def __init__(self, n=0): self.count = multiprocessing.Value('i', n) def increment(self, n=1): """ Increment the counter by n (default = 1) """ with self.count.get_lock(): self.count.value += n @property def value(self): """ Return the value of the counter """ return self.count.value class MultiProcessingQueue(BaseQueue): """ A portable implementation of multiprocessing.Queue. Because of multithreading / multiprocessing semantics, Queue.qsize() may raise the NotImplementedError exception on Unix platforms like Mac OS X where sem_getvalue() is not implemented. This subclass addresses this problem by using a synchronized shared counter (initialized to zero) and increasing / decreasing its value every time the put() and get() methods are called, respectively. This not only prevents NotImplementedError from being raised, but also allows us to implement a reliable version of both qsize() and empty(). 
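
`ListO` exists so `run_task` can swap out `sys.stdout` and collect a callback's print output into `module.log_buffer`. The capture in miniature:

```python
import sys

class ListO:
    """A write-only file object that appends each write to a list."""
    def __init__(self, buffer):
        self._buffer = buffer
    def write(self, x):
        self._buffer.append(x)
    def flush(self):
        pass

log_buffer = []
old_stdout, sys.stdout = sys.stdout, ListO(log_buffer)
print('hello from a callback')
sys.stdout = old_stdout
print(log_buffer)   # ['hello from a callback', '\n']
```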
""" def __init__(self, *args, **kwargs): super(MultiProcessingQueue, self).__init__(*args, **kwargs) self.size = SharedCounter(0) def put(self, *args, **kwargs): self.size.increment(1) super(MultiProcessingQueue, self).put(*args, **kwargs) def get(self, *args, **kwargs): v = super(MultiProcessingQueue, self).get(*args, **kwargs) self.size.increment(-1) return v def qsize(self): """ Reliable implementation of multiprocessing.Queue.qsize() """ return self.size.value if platform.system() == 'Darwin': if hasattr(multiprocessing, 'get_context'): # for py34 def Queue(maxsize=0): return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context()) else: def Queue(maxsize=0): return MultiProcessingQueue(maxsize) else: from multiprocessing import Queue # flake8: noqa PKd6[I..weblocust/libs/utils.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-06 11:50:13 # Modified on 2016-10-26 20:46:20 import logging import hashlib import datetime import socket import base64 import six from six import iteritems md5string = lambda x: hashlib.md5(utf8(x)).hexdigest() class ReadOnlyDict(dict): """A Read Only Dict""" def __setitem__(self, key, value): raise Exception("dict is read-only") def getitem(obj, key=0, default=None): """Get first element of list or return default""" try: return obj[key] except: return default def hide_me(tb, g=globals()): """Hide stack traceback of given stack""" base_tb = tb try: while tb and tb.tb_frame.f_globals is not g: tb = tb.tb_next while tb and tb.tb_frame.f_globals is g: tb = tb.tb_next except Exception as e: logging.exception(e) tb = base_tb if not tb: tb = base_tb return tb def run_in_thread(func, *args, **kwargs): """Run function in thread, return a Thread object""" from threading import Thread thread = Thread(target=func, args=args, kwargs=kwargs) thread.daemon = True thread.start() return thread def run_in_subprocess(func, *args, **kwargs): """Run function in subprocess, return a Process object""" from multiprocessing import Process thread = Process(target=func, args=args, kwargs=kwargs) thread.daemon = True thread.start() return thread def format_date(date, gmt_offset=0, relative=True, shorter=False, full_format=False): """Formats the given date (which should be GMT). By default, we return a relative time (e.g., "2 minutes ago"). You can return an absolute date string with ``relative=False``. You can force a full format date ("July 10, 1980") with ``full_format=True``. This method is primarily intended for dates in the past. For dates in the future, we fall back to full format. From tornado """ if not date: return '-' if isinstance(date, float) or isinstance(date, int): date = datetime.datetime.utcfromtimestamp(date) now = datetime.datetime.utcnow() if date > now: if relative and (date - now).seconds < 60: # Due to click skew, things are some things slightly # in the future. Round timestamps in the immediate # future down to now in relative mode. date = now else: # Otherwise, future dates always use the full format. 
full_format = True local_date = date - datetime.timedelta(minutes=gmt_offset) local_now = now - datetime.timedelta(minutes=gmt_offset) local_yesterday = local_now - datetime.timedelta(hours=24) difference = now - date seconds = difference.seconds days = difference.days format = None if not full_format: if relative and days == 0: if seconds < 50: return ("1 second ago" if seconds <= 1 else "%(seconds)d seconds ago") % {"seconds": seconds} if seconds < 50 * 60: minutes = round(seconds / 60.0) return ("1 minute ago" if minutes <= 1 else "%(minutes)d minutes ago") % {"minutes": minutes} hours = round(seconds / (60.0 * 60)) return ("1 hour ago" if hours <= 1 else "%(hours)d hours ago") % {"hours": hours} if days == 0: format = "%(time)s" elif days == 1 and local_date.day == local_yesterday.day and \ relative: format = "yesterday" if shorter else "yesterday at %(time)s" elif days < 5: format = "%(weekday)s" if shorter else "%(weekday)s at %(time)s" elif days < 334: # 11mo, since confusing for same month last year format = "%(month)s-%(day)s" if shorter else \ "%(month)s-%(day)s at %(time)s" if format is None: format = "%(month_name)s %(day)s, %(year)s" if shorter else \ "%(month_name)s %(day)s, %(year)s at %(time)s" str_time = "%d:%02d" % (local_date.hour, local_date.minute) return format % { "month_name": local_date.strftime('%b'), "weekday": local_date.strftime('%A'), "day": str(local_date.day), "year": str(local_date.year), "month": local_date.month, "time": str_time } class TimeoutError(Exception): pass try: import signal if not hasattr(signal, 'SIGALRM'): raise ImportError('signal') class timeout: """ Time limit of command with timeout(3): time.sleep(10) """ def __init__(self, seconds=1, error_message='Timeout'): self.seconds = seconds self.error_message = error_message def handle_timeout(self, signum, frame): raise TimeoutError(self.error_message) def __enter__(self): if self.seconds: signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) def __exit__(self, type, value, traceback): if self.seconds: signal.alarm(0) except ImportError: class timeout: """ Time limit of command (for windows) """ def __init__(self, seconds=1, error_message='Timeout'): pass def __enter__(self): pass def __exit__(self, type, value, traceback): pass def utf8(string): """ Make sure string is utf8 encoded bytes. If parameter is a object, object.__str__ will been called before encode as bytes """ if isinstance(string, six.text_type): return string.encode('utf8') elif isinstance(string, six.binary_type): return string else: return six.text_type(string).encode('utf8') def text(string, encoding='utf8'): """ Make sure string is unicode type, decode with given encoding if it's not. If parameter is a object, object.__str__ will been called """ if isinstance(string, six.text_type): return string elif isinstance(string, six.binary_type): return string.decode(encoding) else: return six.text_type(string) def pretty_unicode(string): """ Make sure string is unicode, try to decode with utf8, or unicode escaped string if failed. """ if isinstance(string, six.text_type): return string try: return string.decode("utf8") except UnicodeDecodeError: return string.decode('Latin-1').encode('unicode_escape').decode("utf8") def unicode_string(string): """ Make sure string is unicode, try to default with utf8, or base64 if failed. 
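
The `timeout` context manager relies on `SIGALRM`, so it only works on POSIX main threads; the `except ImportError` branch swaps in a no-op version on Windows. Usage as documented in its docstring (assuming the weblocust package is importable):

```python
import time

from weblocust.libs.utils import timeout, TimeoutError

try:
    with timeout(1):
        time.sleep(3)
except TimeoutError:
    print('timed out after 1s')
```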
can been decode by `decode_unicode_string` """ if isinstance(string, six.text_type): return string try: return string.decode("utf8") except UnicodeDecodeError: return '[BASE64-DATA]' + base64.b64encode(string) + '[/BASE64-DATA]' def unicode_dict(_dict): """ Make sure keys and values of dict is unicode. """ r = {} for k, v in iteritems(_dict): r[unicode_obj(k)] = unicode_obj(v) return r def unicode_list(_list): """ Make sure every element in list is unicode. bytes will encode in base64 """ return [unicode_obj(x) for x in _list] def unicode_obj(obj): """ Make sure keys and values of dict/list/tuple is unicode. bytes will encode in base64. Can been decode by `decode_unicode_obj` """ if isinstance(obj, dict): return unicode_dict(obj) elif isinstance(obj, (list, tuple)): return unicode_list(obj) elif isinstance(obj, six.string_types): return unicode_string(obj) elif isinstance(obj, (int, float)): return obj elif obj is None: return obj else: try: return text(obj) except: return text(repr(obj)) def decode_unicode_string(string): """ Decode string encoded by `unicode_string` """ if string.startswith('[BASE64-DATA]') and string.endswith('[/BASE64-DATA]'): return base64.b64decode(string[len('[BASE64-DATA]'):-len('[/BASE64-DATA]')]) return string def decode_unicode_obj(obj): """ Decode unicoded dict/list/tuple encoded by `unicode_obj` """ if isinstance(obj, dict): r = {} for k, v in iteritems(obj): r[decode_unicode_string(k)] = decode_unicode_obj(v) return r elif isinstance(obj, six.string_types): return decode_unicode_string(obj) elif isinstance(obj, (list, tuple)): return [decode_unicode_obj(x) for x in obj] else: return obj class Get(object): """ Lazy value calculate for object """ def __init__(self, getter): self.getter = getter def __get__(self, instance, owner): return self.getter() class ObjectDict(dict): """ Object like dict, every dict[key] can visite by dict.key If dict[key] is `Get`, calculate it's value. """ def __getattr__(self, name): ret = self.__getitem__(name) if hasattr(ret, '__get__'): return ret.__get__(self, ObjectDict) return ret def load_object(name): """Load object from module""" if "." 
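
`unicode_string` and `decode_unicode_string` wrap undecodable bytes in `[BASE64-DATA]...[/BASE64-DATA]` markers. (The `'[BASE64-DATA]' + base64.b64encode(string)` concatenation is Python 2 code; on Python 3 `b64encode` returns bytes and needs a `.decode('ascii')`, as below.) The round trip:

```python
import base64

blob = b'\xff\xfe'                      # not valid utf8
wrapped = '[BASE64-DATA]' + base64.b64encode(blob).decode('ascii') + '[/BASE64-DATA]'

if wrapped.startswith('[BASE64-DATA]') and wrapped.endswith('[/BASE64-DATA]'):
    print(base64.b64decode(wrapped[len('[BASE64-DATA]'):-len('[/BASE64-DATA]')]))
# b'\xff\xfe'
```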
not in name: raise Exception('load object need module.object') module_name, object_name = name.rsplit('.', 1) if six.PY2: module = __import__(module_name, globals(), locals(), [utf8(object_name)], -1) else: module = __import__(module_name, globals(), locals(), [object_name]) return getattr(module, object_name) def get_python_console(namespace=None): """ Return a interactive python console instance with caller's stack """ if namespace is None: import inspect frame = inspect.currentframe() caller = frame.f_back if not caller: logging.error("can't find caller who start this console.") caller = frame namespace = dict(caller.f_globals) namespace.update(caller.f_locals) try: from IPython.terminal.interactiveshell import TerminalInteractiveShell shell = TerminalInteractiveShell(user_ns=namespace) except ImportError: try: import readline import rlcompleter readline.set_completer(rlcompleter.Completer(namespace).complete) readline.parse_and_bind("tab: complete") except ImportError: pass import code shell = code.InteractiveConsole(namespace) shell._quit = False def exit(): shell._quit = True def readfunc(prompt=""): if shell._quit: raise EOFError return six.moves.input(prompt) # inject exit method shell.ask_exit = exit shell.raw_input = readfunc return shell def python_console(namespace=None): """Start a interactive python console with caller's stack""" if namespace is None: import inspect frame = inspect.currentframe() caller = frame.f_back if not caller: logging.error("can't find caller who start this console.") caller = frame namespace = dict(caller.f_globals) namespace.update(caller.f_locals) return get_python_console(namespace=namespace).interact() def check_port_open(port, addr='127.0.0.1'): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) result = sock.connect_ex((addr, port)) if result == 0: return True else: return False PKd6[Iz˭weblocust/libs/wsgi_xmlrpc.py# Copyright (c) 2006-2007 Open Source Applications Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
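
`load_object` is the string-to-object hook used wherever a class is named on the command line: split on the last dot, import the module, `getattr` the attribute. An equivalent with `importlib`:

```python
import importlib

def load_object(name):
    """Load 'module.object' the way utils.load_object does."""
    module_name, object_name = name.rsplit('.', 1)
    return getattr(importlib.import_module(module_name), object_name)

join = load_object('os.path.join')
print(join('data', 'taskdb.db'))   # data/taskdb.db (separator is os-dependent)
```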
# # Origin: https://code.google.com/p/wsgi-xmlrpc/ from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher import logging logger = logging.getLogger(__name__) class WSGIXMLRPCApplication(object): """Application to handle requests to the XMLRPC service""" def __init__(self, instance=None, methods=[]): """Create windmill xmlrpc dispatcher""" try: self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None) except TypeError: # python 2.4 self.dispatcher = SimpleXMLRPCDispatcher() if instance is not None: self.dispatcher.register_instance(instance) for method in methods: self.dispatcher.register_function(method) self.dispatcher.register_introspection_functions() def register_instance(self, instance): return self.dispatcher.register_instance(instance) def register_function(self, function, name=None): return self.dispatcher.register_function(function, name) def handler(self, environ, start_response): """XMLRPC service for windmill browser core to communicate with""" if environ['REQUEST_METHOD'] == 'POST': return self.handle_POST(environ, start_response) else: start_response("400 Bad request", [('Content-Type', 'text/plain')]) return [''] def handle_POST(self, environ, start_response): """Handles the HTTP POST request. Attempts to interpret all HTTP POST requests as XML-RPC calls, which are forwarded to the server's _dispatch method for handling. Most code taken from SimpleXMLRPCServer with modifications for wsgi and my custom dispatcher. """ try: # Get arguments by reading body of request. # We read this in chunks to avoid straining # socket.read(); around the 10 or 15Mb mark, some platforms # begin to have problems (bug #792570). length = int(environ['CONTENT_LENGTH']) data = environ['wsgi.input'].read(length) # In previous versions of SimpleXMLRPCServer, _dispatch # could be overridden in this class, instead of in # SimpleXMLRPCDispatcher. To maintain backwards compatibility, # check to see if a subclass implements _dispatch and # using that method if present. 
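
`handle_POST` ultimately hands the request body to `SimpleXMLRPCDispatcher._marshaled_dispatch` and writes the marshalled response back through WSGI. The same dispatch without the WSGI plumbing, using the Python 3 module names that `six.moves.xmlrpc_server` resolves to:

```python
from xmlrpc.server import SimpleXMLRPCDispatcher
import xmlrpc.client

dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None)
dispatcher.register_function(lambda: 'pong', 'ping')

request_body = xmlrpc.client.dumps((), 'ping').encode('utf8')
print(dispatcher._marshaled_dispatch(request_body))
# b"<?xml version='1.0'?>\n<methodResponse>..." containing 'pong'
```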
response = self.dispatcher._marshaled_dispatch( data, getattr(self.dispatcher, '_dispatch', None) ) response += b'\n' except Exception as e: # This should only happen if the module is buggy # internal error, report as HTTP server error logger.exception(e) start_response("500 Server error", [('Content-Type', 'text/plain')]) return [] else: # got a valid XML RPC response start_response("200 OK", [('Content-Type', 'text/xml'), ('Content-Length', str(len(response)),)]) return [response] def __call__(self, environ, start_response): return self.handler(environ, start_response) PKd6[Iweblocust/libs/result_dump.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-03-27 20:12:11 # Modified on 2016-10-26 20:46:20 import six import csv import json import itertools from io import StringIO, BytesIO from six import iteritems def result_formater(results): common_fields = None for result in results: result.setdefault('result', None) if isinstance(result['result'], dict): if common_fields is None: common_fields = set(result['result'].keys()) else: common_fields &= set(result['result'].keys()) else: common_fields = set() for result in results: result['result_formated'] = {} if not common_fields: result['others'] = result['result'] elif not isinstance(result['result'], dict): result['others'] = result['result'] else: result_formated = {} others = {} for key, value in iteritems(result['result']): if key in common_fields: result_formated[key] = value else: others[key] = value result['result_formated'] = result_formated result['others'] = others return common_fields or set(), results def dump_as_json(results, valid=False): first = True if valid: yield '[' for result in results: if valid: if first: first = False else: yield ', ' yield json.dumps(result, ensure_ascii=False) + '\n' if valid: yield ']' def dump_as_txt(results): for result in results: yield ( result.get('url', None) + '\t' + json.dumps(result.get('result', None), ensure_ascii=False) + '\n' ) def dump_as_csv(results): def toString(obj): if isinstance(obj, six.binary_type): if six.PY2: return obj else: return obj.decode('utf8') elif isinstance(obj, six.text_type): if six.PY2: return obj.encode('utf8') else: return obj else: if six.PY2: return json.dumps(obj, ensure_ascii=False).encode('utf8') else: return json.dumps(obj, ensure_ascii=False) # python2 needs byes when python3 needs unicode if six.PY2: stringio = BytesIO() else: stringio = StringIO() csv_writer = csv.writer(stringio) it = iter(results) first_30 = [] for result in it: first_30.append(result) if len(first_30) >= 30: break common_fields, _ = result_formater(first_30) common_fields_l = sorted(common_fields) csv_writer.writerow([toString('url')] + [toString(x) for x in common_fields_l] + [toString('...')]) for result in itertools.chain(first_30, it): result['result_formated'] = {} if not common_fields: result['others'] = result['result'] elif not isinstance(result['result'], dict): result['others'] = result['result'] else: result_formated = {} others = {} for key, value in iteritems(result['result']): if key in common_fields: result_formated[key] = value else: others[key] = value result['result_formated'] = result_formated result['others'] = others csv_writer.writerow( [toString(result['url'])] + [toString(result['result_formated'].get(k, '')) for k in common_fields_l] + [toString(result['others'])] ) yield stringio.getvalue() 
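            # After each yield the buffer is reset (truncate + seek below) so
            # every chunk handed to the caller contains only newly written
            # CSV rows, keeping memory flat while streaming large result sets.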
stringio.truncate(0) stringio.seek(0) PKd6[I-%weblocust/processor/project_module.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-16 22:24:20 # Modified on 2016-10-26 20:46:20 import os import six import sys import imp import time import weakref import logging import inspect import traceback import linecache from weblocust.libs import utils from weblocust.libs.log import SaveLogHandler, LogFormatter logger = logging.getLogger("processor") class ProjectManager(object): """ load projects from projectdb, update project """ CHECK_PROJECTS_INTERVAL = 5 * 60 RELOAD_PROJECT_INTERVAL = 60 * 60 @staticmethod def build_module(project, env={}): '''Build project script as module''' from weblocust.libs import base_handler assert 'name' in project, 'need name of project' assert 'script' in project, 'need script of project' # fix for old non-package version scripts weblocust_path = os.path.join(os.path.dirname(__file__), "..") if weblocust_path not in sys.path: sys.path.insert(1, weblocust_path) env = dict(env) env.update({ 'debug': project.get('status', 'DEBUG') == 'DEBUG', }) loader = ProjectLoader(project) module = loader.load_module(project['name']) # logger inject module.log_buffer = [] module.logging = module.logger = logging.Logger(project['name']) if env.get('enable_stdout_capture', True): handler = SaveLogHandler(module.log_buffer) handler.setFormatter(LogFormatter(color=False)) else: handler = logging.StreamHandler() handler.setFormatter(LogFormatter(color=True)) module.logger.addHandler(handler) if '__handler_cls__' not in module.__dict__: BaseHandler = module.__dict__.get('BaseHandler', base_handler.BaseHandler) for each in list(six.itervalues(module.__dict__)): if inspect.isclass(each) and each is not BaseHandler \ and issubclass(each, BaseHandler): module.__dict__['__handler_cls__'] = each _class = module.__dict__.get('__handler_cls__') assert _class is not None, "need BaseHandler in project module" instance = _class() instance.__env__ = env instance.project_name = project['name'] instance.project = project return { 'loader': loader, 'module': module, 'class': _class, 'instance': instance, 'exception': None, 'exception_log': '', 'info': project, 'load_time': time.time(), } def __init__(self, projectdb, env): self.projectdb = projectdb self.env = env self.projects = {} self.last_check_projects = time.time() def _need_update(self, project_name, updatetime=None, md5sum=None): '''Check if project_name need update''' if project_name not in self.projects: return True elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'): return True elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0): return True elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL: return True return False def _check_projects(self): '''Check projects by last update time''' for project in self.projectdb.check_update(self.last_check_projects, ['name', 'updatetime']): if project['name'] not in self.projects: continue if project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0): self._update_project(project['name']) self.last_check_projects = time.time() def _update_project(self, project_name): '''Update one project from database''' project = self.projectdb.get(project_name) if not project: return None return self._load_project(project) def 
_load_project(self, project): '''Load project into self.projects from project info dict''' try: project['md5sum'] = utils.md5string(project['script']) ret = self.build_module(project, self.env) self.projects[project['name']] = ret except Exception as e: logger.exception("load project %s error", project.get('name', None)) ret = { 'loader': None, 'module': None, 'class': None, 'instance': None, 'exception': e, 'exception_log': traceback.format_exc(), 'info': project, 'load_time': time.time(), } self.projects[project['name']] = ret return False logger.debug('project: %s updated.', project.get('name', None)) return True def get(self, project_name, updatetime=None, md5sum=None): '''get project data object, return None if not exists''' if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL: self._check_projects() if self._need_update(project_name, updatetime, md5sum): self._update_project(project_name) return self.projects.get(project_name, None) class ProjectFinder(object): '''ProjectFinder class for sys.meta_path''' def __init__(self, projectdb): self.get_projectdb = weakref.ref(projectdb) @property def projectdb(self): return self.get_projectdb() def find_module(self, fullname, path=None): if fullname == 'projects': return self parts = fullname.split('.') if len(parts) == 2 and parts[0] == 'projects': name = parts[1] if not self.projectdb: return info = self.projectdb.get(name) if info: return ProjectLoader(info) def load_module(self, fullname): mod = imp.new_module(fullname) mod.__file__ = '' mod.__loader__ = self mod.__path__ = [''] mod.__package__ = 'projects' return mod def is_package(self, fullname): return True class ProjectLoader(object): '''ProjectLoader class for sys.meta_path''' def __init__(self, project, mod=None): self.project = project self.name = project['name'] self.mod = mod def load_module(self, fullname): if self.mod is None: self.mod = mod = imp.new_module(fullname) else: mod = self.mod mod.__file__ = '<%s>' % self.name mod.__loader__ = self mod.__project__ = self.project mod.__package__ = '' code = self.get_code(fullname) six.exec_(code, mod.__dict__) linecache.clearcache() return mod def is_package(self, fullname): return False def get_code(self, fullname): return compile(self.get_source(fullname), '<%s>' % self.name, 'exec') def get_source(self, fullname): script = self.project['script'] if isinstance(script, six.text_type): return script.encode('utf8') return script PKd6[Iq weblocust/processor/processor.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-16 22:59:56 # Modified on 2016-10-26 20:46:20 import sys import six import time import logging import traceback logger = logging.getLogger("processor") from six.moves import queue as Queue from weblocust.libs import utils from weblocust.libs.log import LogFormatter from weblocust.libs.utils import pretty_unicode, hide_me from weblocust.libs.response import rebuild_response from .project_module import ProjectManager, ProjectFinder class ProcessorResult(object): """The result and logs producted by a callback""" def __init__(self, result=None, follows=(), messages=(), logs=(), exception=None, extinfo={}, save=None): self.result = result self.follows = follows self.messages = messages self.logs = logs self.exception = exception self.extinfo = extinfo self.save = save def rethrow(self): """rethrow the exception""" if self.exception: raise self.exception 
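    # logstr() below folds self.logs -- a mix of plain strings and
    # logging.LogRecord objects -- into one unicode string, trimming this
    # framework's own frames out of captured tracebacks via hide_me().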
def logstr(self): """handler the log records to formatted string""" result = [] formater = LogFormatter(color=False) for record in self.logs: if isinstance(record, six.string_types): result.append(pretty_unicode(record)) else: if record.exc_info: a, b, tb = record.exc_info tb = hide_me(tb, globals()) record.exc_info = a, b, tb result.append(pretty_unicode(formater.format(record))) result.append(u'\n') return u''.join(result) class Processor(object): PROCESS_TIME_LIMIT = 30 EXCEPTION_LIMIT = 3 RESULT_LOGS_LIMIT = 1000 RESULT_RESULT_LIMIT = 10 def __init__(self, projectdb, inqueue, status_queue, newtask_queue, result_queue, enable_stdout_capture=True, enable_projects_import=True): self.inqueue = inqueue self.status_queue = status_queue self.newtask_queue = newtask_queue self.result_queue = result_queue self.projectdb = projectdb self.enable_stdout_capture = enable_stdout_capture self._quit = False self._exceptions = 10 self.project_manager = ProjectManager(projectdb, dict( result_queue=self.result_queue, enable_stdout_capture=self.enable_stdout_capture, )) if enable_projects_import: self.enable_projects_import() def enable_projects_import(self): ''' Enable import other project as module `from project import project_name` ''' if six.PY2: sys.meta_path.append(ProjectFinder(self.projectdb)) def __del__(self): pass def on_task(self, task, response): '''Deal one task''' start_time = time.time() response = rebuild_response(response) try: assert 'taskid' in task, 'need taskid in task' project = task['project'] updatetime = task.get('project_updatetime', None) md5sum = task.get('project_md5sum', None) project_data = self.project_manager.get(project, updatetime, md5sum) assert project_data, "no such project!" if project_data.get('exception'): ret = ProcessorResult(logs=(project_data.get('exception_log'), ), exception=project_data['exception']) else: ret = project_data['instance'].run_task( project_data['module'], task, response) except Exception as e: logstr = traceback.format_exc() ret = ProcessorResult(logs=(logstr, ), exception=e) process_time = time.time() - start_time if not ret.extinfo.get('not_send_status', False): if ret.exception: track_headers = dict(response.headers) else: track_headers = {} for name in ('etag', 'last-modified'): if name not in response.headers: continue track_headers[name] = response.headers[name] status_pack = { 'taskid': task['taskid'], 'project': task['project'], 'url': task.get('url'), 'track': { 'fetch': { 'ok': response.isok(), 'redirect_url': response.url if response.url != response.orig_url else None, 'time': response.time, 'error': response.error, 'status_code': response.status_code, 'encoding': response.encoding, 'headers': track_headers, 'content': response.text[:500] if ret.exception else None, }, 'process': { 'ok': not ret.exception, 'time': process_time, 'follows': len(ret.follows), 'result': ( None if ret.result is None else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT] ), 'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:], 'exception': ret.exception, }, 'save': ret.save, }, } if 'schedule' in task: status_pack['schedule'] = task['schedule'] # FIXME: unicode_obj should used in scheduler before store to database # it's used here for performance. self.status_queue.put(utils.unicode_obj(status_pack)) # FIXME: unicode_obj should used in scheduler before store to database # it's used here for performance. 
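            # Follow-up requests are flushed to the scheduler in slices of
            # 1000 so no single newtask_queue message grows unbounded; each
            # slice is normalized with utils.unicode_obj first.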
if ret.follows: for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)): self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each]) for project, msg, url in ret.messages: try: self.on_task({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, { 'status_code': 200, 'url': url, 'save': (task['project'], msg), }) except Exception as e: logger.exception('Sending message error.') continue if ret.exception: logger_func = logger.error else: logger_func = logger.info logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r follows:%d msg:%d error:%r' % ( task['project'], task['taskid'], task.get('url'), response.status_code, len(response.content), ret.result, len(ret.follows), len(ret.messages), ret.exception)) return True def quit(self): '''Set quit signal''' self._quit = True def run(self): '''Run loop''' logger.info("processor starting...") while not self._quit: try: task, response = self.inqueue.get(timeout=1) self.on_task(task, response) self._exceptions = 0 except Queue.Empty as e: continue except KeyboardInterrupt: break except Exception as e: logger.exception(e) self._exceptions += 1 if self._exceptions > self.EXCEPTION_LIMIT: break continue logger.info("processor exiting...") PKd6[IA22weblocust/processor/__init__.pyfrom .processor import ProcessorResult, Processor PKd6[I0g weblocust/scheduler/scheduler.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-07 17:05:11 # Modified on 2016-10-26 20:46:20 import itertools import json import logging import os import time from collections import deque from six import iteritems, itervalues from six.moves import queue as Queue from weblocust.libs import counter, utils from .task_queue import TaskQueue logger = logging.getLogger('scheduler') class Scheduler(object): UPDATE_PROJECT_INTERVAL = 5 * 60 default_schedule = { 'priority': 0, 'retries': 3, 'exetime': 0, 'age': 3*60,# every web page changes within 3 minutes 'itag': None, } LOOP_LIMIT = 1000 LOOP_INTERVAL = 0.1 ACTIVE_TASKS = 100 INQUEUE_LIMIT = 0 EXCEPTION_LIMIT = 3 DELETE_TIME = 10 * 60 DEFAULT_RETRY_DELAY = { 0: 30, 1: 1*60*60, 2: 6*60*60, 3: 12*60*60, '': 24*60*60 } def __init__(self, taskdb, projectdb, newtask_queue, status_queue, out_queue, data_path='./data', resultdb=None): self.taskdb = taskdb self.projectdb = projectdb self.resultdb = resultdb self.newtask_queue = newtask_queue self.status_queue = status_queue self.out_queue = out_queue self.data_path = data_path self._send_buffer = deque() self._quit = False self._exceptions = 0 self.projects = dict() self._force_update_project = False self._last_update_project = 0 self.task_queue = dict() self._last_tick = int(time.time()) self._sent_finished_event = dict() self._cnt = { "5m_time": counter.CounterManager( lambda: counter.TimebaseAverageEventCounter(30, 10)), "5m": counter.CounterManager( lambda: counter.TimebaseAverageWindowCounter(30, 10)), "1h": counter.CounterManager( lambda: counter.TimebaseAverageWindowCounter(60, 60)), "1d": counter.CounterManager( lambda: counter.TimebaseAverageWindowCounter(10 * 60, 24 * 6)), "all": counter.CounterManager( lambda: counter.TotalCounter()), } self._cnt['1h'].load(os.path.join(self.data_path, 'scheduler.1h')) self._cnt['1d'].load(os.path.join(self.data_path, 'scheduler.1d')) self._cnt['all'].load(os.path.join(self.data_path, 'scheduler.all')) 
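        # _last_dump_cnt below records the time of the last counter flush;
        # _try_dump_cnt() uses it to dump the 1h/1d/all counters to
        # data_path roughly once every 60 seconds and to log a 5-minute
        # activity summary via _print_counter_log().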
self._last_dump_cnt = 0 def _update_projects(self): '''Check project update''' now = time.time() if ( not self._force_update_project and self._last_update_project + self.UPDATE_PROJECT_INTERVAL > now #就是说没有强制更新,并且没有到更新时间就跳过 ): return for project in self.projectdb.check_update(self._last_update_project): self._update_project(project) logger.debug("project: %s updated.", project['name']) self._force_update_project = False self._last_update_project = now def _update_project(self, project): '''update one project''' if project['name'] not in self.projects: self.projects[project['name']] = {} self.projects[project['name']].update(project) self.projects[project['name']]['md5sum'] = utils.md5string(project['script']) if not self.projects[project['name']].get('active_tasks', None): self.projects[project['name']]['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS) # load task queue when project is running and delete task_queue when project is stoped if project['status'] in ('RUNNING', 'DEBUG'): if project['name'] not in self.task_queue: self._load_tasks(project['name']) self.task_queue[project['name']].rate = project['rate'] self.task_queue[project['name']].burst = project['burst'] # update project runtime info from processor by sending a _on_get_info # request, result is in status_page.track.save self.on_select_task({ 'taskid': '_on_get_info', 'project': project['name'], 'url': 'data:,_on_get_info', 'status': self.taskdb.SUCCESS, 'fetch': { 'save': ['min_tick', 'retry_delay'], }, 'process': { 'callback': '_on_get_info', }, }) else: if project['name'] in self.task_queue: self.task_queue[project['name']].rate = 0 self.task_queue[project['name']].burst = 0 del self.task_queue[project['name']] if project not in self._cnt['all']: self._update_project_cnt(project['name']) scheduler_task_fields = ['taskid', 'project', 'schedule', ] def _load_tasks(self, project): '''load tasks from database''' self.task_queue[project] = TaskQueue(rate=0, burst=0) for task in self.taskdb.load_tasks( self.taskdb.ACTIVE, project, self.scheduler_task_fields ): taskid = task['taskid'] _schedule = task.get('schedule', self.default_schedule) priority = _schedule.get('priority', self.default_schedule['priority']) exetime = _schedule.get('exetime', self.default_schedule['exetime']) self.task_queue[project].put(taskid, priority, exetime) logger.debug('project: %s loaded %d tasks.', project, len(self.task_queue[project])) if self.projects[project]['status'] in ('RUNNING', 'DEBUG'): self.task_queue[project].rate = self.projects[project]['rate'] self.task_queue[project].burst = self.projects[project]['burst'] else: self.task_queue[project].rate = 0 self.task_queue[project].burst = 0 if project not in self._cnt['all']: self._update_project_cnt(project) self._cnt['all'].value((project, 'pending'), len(self.task_queue[project])) def _update_project_cnt(self, project): status_count = self.taskdb.status_count(project) self._cnt['all'].value( (project, 'success'), status_count.get(self.taskdb.SUCCESS, 0) ) self._cnt['all'].value( (project, 'failed'), status_count.get(self.taskdb.FAILED, 0) + status_count.get(self.taskdb.BAD, 0) ) self._cnt['all'].value( (project, 'pending'), status_count.get(self.taskdb.ACTIVE, 0) ) def task_verify(self, task): ''' return False if any of 'taskid', 'project', 'url' is not in task dict or project in not in task_queue ''' for each in ('taskid', 'project', 'url', ): if each not in task or not task[each]: logger.error('%s not in task: %.200r', each, task) return False if task['project'] not in self.task_queue: if 
task['project'] in self.projects: logger.error('project %s not started, please set status to RUNNING or DEBUG', task['project']) else: logger.error('unknown project: %s', task['project']) return False return True def insert_task(self, task): '''insert task into database''' return self.taskdb.insert(task['project'], task['taskid'], task) def update_task(self, task): '''update task in database''' return self.taskdb.update(task['project'], task['taskid'], task) def put_task(self, task): '''put task to task queue''' _schedule = task.get('schedule', self.default_schedule) self.task_queue[task['project']].put( task['taskid'], priority=_schedule.get('priority', self.default_schedule['priority']), exetime=_schedule.get('exetime', self.default_schedule['exetime']) ) def send_task(self, task, force=True): ''' dispatch task to fetcher out queue may have size limit to prevent block, a send_buffer is used ''' try: self.out_queue.put_nowait(task) except Queue.Full: if force: self._send_buffer.appendleft(task) else: raise def _check_task_done(self): '''Check status queue''' cnt = 0 try: while True: task = self.status_queue.get_nowait() # check _on_get_info result here if task.get('taskid') == '_on_get_info' and 'project' in task and 'track' in task: if task['project'] not in self.projects: continue self.projects[task['project']].update(task['track'].get('save') or {}) logger.info( '%s on_get_info %r', task['project'], task['track'].get('save', {}) ) continue elif not self.task_verify(task): continue self.on_task_status(task) cnt += 1 except Queue.Empty: pass return cnt merge_task_fields = ['taskid', 'project', 'url', 'status', 'schedule', 'lastcrawltime'] def _check_request(self): '''Check new task queue''' tasks = {} while len(tasks) < self.LOOP_LIMIT: try: task = self.newtask_queue.get_nowait() except Queue.Empty: break if isinstance(task, list): _tasks = task else: _tasks = (task, ) for task in _tasks: if not self.task_verify(task): continue if task['taskid'] in self.task_queue[task['project']]: if not task.get('schedule', {}).get('force_update', False): logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task) continue if task['taskid'] in tasks: if not task.get('schedule', {}).get('force_update', False): continue tasks[task['taskid']] = task for task in itervalues(tasks): self.on_request(task) return len(tasks) def _check_cronjob(self): """Check projects cronjob tick, return True when a new tick is sended""" now = time.time() self._last_tick = int(self._last_tick) if now - self._last_tick < 1: return False self._last_tick += 1 for project in itervalues(self.projects): if project['status'] not in ('DEBUG', 'RUNNING'): continue if project.get('min_tick', 0) == 0: continue if self._last_tick % int(project['min_tick']) != 0: continue self.on_select_task({ 'taskid': '_on_cronjob', 'project': project['name'], 'url': 'data:,_on_cronjob', 'status': self.taskdb.SUCCESS, 'fetch': { 'save': { 'tick': self._last_tick, }, }, 'process': { 'callback': '_on_cronjob', }, }) return True request_task_fields = [ 'taskid', 'project', 'url', 'status', 'schedule', 'fetch', 'process', 'track', 'lastcrawltime' ] def _check_select(self): '''Select task to fetch & process''' while self._send_buffer: _task = self._send_buffer.pop() try: # use force=False here to prevent automatic send_buffer append and get exception self.send_task(_task, False) except Queue.Full: self._send_buffer.append(_task) break if self.out_queue.full(): return {} taskids = [] cnt = 0 cnt_dict = dict() limit = self.LOOP_LIMIT for project, 
task_queue in iteritems(self.task_queue): if cnt >= limit: break # task queue self.task_queue[project].check_update() project_cnt = 0 # check send_buffer here. when not empty, out_queue may blocked. Not sending tasks while cnt < limit and project_cnt < limit / 10: taskid = task_queue.get() if not taskid: break taskids.append((project, taskid)) project_cnt += 1 cnt += 1 cnt_dict[project] = project_cnt if project_cnt: self._sent_finished_event[project] = 'need' # check and send finished event to project elif len(task_queue) == 0 and self._sent_finished_event.get(project) == 'need': self._sent_finished_event[project] = 'sent' self.on_select_task({ 'taskid': 'on_finished', 'project': project, 'url': 'data:,on_finished', 'status': self.taskdb.SUCCESS, 'process': { 'callback': 'on_finished', }, }) for project, taskid in taskids: self._load_put_task(project, taskid) return cnt_dict def _load_put_task(self, project, taskid): try: task = self.taskdb.get_task(project, taskid, fields=self.request_task_fields) except ValueError: logger.error('bad task pack %s:%s', project, taskid) return if not task: return task = self.on_select_task(task) def _print_counter_log(self): # print top 5 active counters keywords = ('pending', 'success', 'retry', 'failed') total_cnt = {} project_actives = [] project_fails = [] for key in keywords: total_cnt[key] = 0 for project, subcounter in iteritems(self._cnt['5m']): actives = 0 for key in keywords: cnt = subcounter.get(key, None) if cnt: cnt = cnt.sum total_cnt[key] += cnt actives += cnt project_actives.append((actives, project)) fails = subcounter.get('failed', None) if fails: project_fails.append((fails.sum, project)) top_2_fails = sorted(project_fails, reverse=True)[:2] top_3_actives = sorted([x for x in project_actives if x[1] not in top_2_fails], reverse=True)[:5 - len(top_2_fails)] log_str = ("in 5m: new:%(pending)d,success:%(success)d," "retry:%(retry)d,failed:%(failed)d" % total_cnt) for _, project in itertools.chain(top_3_actives, top_2_fails): subcounter = self._cnt['5m'][project].to_dict(get_value='sum') log_str += " %s:%d,%d,%d,%d" % (project, subcounter.get('pending', 0), subcounter.get('success', 0), subcounter.get('retry', 0), subcounter.get('failed', 0)) logger.info(log_str) def _dump_cnt(self): '''Dump counters to file''' self._cnt['1h'].dump(os.path.join(self.data_path, 'scheduler.1h')) self._cnt['1d'].dump(os.path.join(self.data_path, 'scheduler.1d')) self._cnt['all'].dump(os.path.join(self.data_path, 'scheduler.all')) def _try_dump_cnt(self): '''Dump counters every 60 seconds''' now = time.time() if now - self._last_dump_cnt > 60: self._last_dump_cnt = now self._dump_cnt() self._print_counter_log() def _check_delete(self): '''Check project delete''' now = time.time() for project in list(itervalues(self.projects)): if project['status'] != 'STOP': continue if now - project['updatetime'] < self.DELETE_TIME: continue if 'delete' not in self.projectdb.split_group(project['group']): continue logger.warning("deleting project: %s!", project['name']) if project['name'] in self.task_queue: self.task_queue[project['name']].rate = 0 self.task_queue[project['name']].burst = 0 del self.task_queue[project['name']] del self.projects[project['name']] self.taskdb.drop(project['name']) self.projectdb.drop(project['name']) if self.resultdb: self.resultdb.drop(project['name']) for each in self._cnt.values(): del each[project['name']] def __len__(self): return sum(len(x) for x in itervalues(self.task_queue)) def quit(self): '''Set quit signal''' self._quit = True # stop 
xmlrpc server if hasattr(self, 'xmlrpc_server'): self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop) self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop) def run_once(self): '''comsume queues and feed tasks to fetcher, once''' self._update_projects() self._check_task_done() self._check_request() while self._check_cronjob(): pass self._check_select() self._check_delete() self._try_dump_cnt() def run(self): '''Start scheduler loop''' logger.info("loading projects") while not self._quit: try: time.sleep(self.LOOP_INTERVAL) self.run_once() self._exceptions = 0 except KeyboardInterrupt: break except Exception as e: logger.exception(e) self._exceptions += 1 if self._exceptions > self.EXCEPTION_LIMIT: break continue logger.info("scheduler exiting...") self._dump_cnt() def trigger_on_start(self, project): '''trigger an on_start callback of project''' self.newtask_queue.put({ "project": project, "taskid": "on_start", "url": "data:,on_start", "process": { "callback": "on_start", }, }) def xmlrpc_run(self, port=23333, bind='127.0.0.1', logRequests=False): '''Start xmlrpc interface''' from weblocust.libs.wsgi_xmlrpc import WSGIXMLRPCApplication application = WSGIXMLRPCApplication() application.register_function(self.quit, '_quit') application.register_function(self.__len__, 'size') def dump_counter(_time, _type): try: return self._cnt[_time].to_dict(_type) except: logger.exception('') application.register_function(dump_counter, 'counter') def new_task(task): if self.task_verify(task): self.newtask_queue.put(task) return True return False application.register_function(new_task, 'newtask') def send_task(task): '''dispatch task to fetcher''' self.send_task(task) return True application.register_function(send_task, 'send_task') def update_project(): self._force_update_project = True application.register_function(update_project, 'update_project') def get_active_tasks(project=None, limit=100): allowed_keys = set(( 'taskid', 'project', 'status', 'url', 'lastcrawltime', 'updatetime', 'track', )) track_allowed_keys = set(( 'ok', 'time', 'follows', 'status_code', )) iters = [iter(x['active_tasks']) for k, x in iteritems(self.projects) if x and (k == project if project else True)] tasks = [next(x, None) for x in iters] result = [] while len(result) < limit and tasks and not all(x is None for x in tasks): updatetime, task = t = max(t for t in tasks if t) i = tasks.index(t) tasks[i] = next(iters[i], None) for key in list(task): if key == 'track': for k in list(task[key].get('fetch', [])): if k not in track_allowed_keys: del task[key]['fetch'][k] for k in list(task[key].get('process', [])): if k not in track_allowed_keys: del task[key]['process'][k] if key in allowed_keys: continue del task[key] result.append(t) # fix for ":dictionary key must be string" # have no idea why return json.loads(json.dumps(result)) application.register_function(get_active_tasks, 'get_active_tasks') import tornado.wsgi import tornado.ioloop import tornado.httpserver container = tornado.wsgi.WSGIContainer(application) self.xmlrpc_ioloop = tornado.ioloop.IOLoop() self.xmlrpc_server = tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop) self.xmlrpc_server.listen(port=port, address=bind) self.xmlrpc_ioloop.start() def on_request(self, task): if self.INQUEUE_LIMIT and len(self.task_queue[task['project']]) >= self.INQUEUE_LIMIT: logger.debug('overflow task %(project)s:%(taskid)s %(url)s', task) return oldtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.merge_task_fields) if oldtask: return 
self.on_old_request(task, oldtask) else: return self.on_new_request(task) def on_new_request(self, task): '''Called when a new request is arrived''' task['status'] = self.taskdb.ACTIVE self.insert_task(task) self.put_task(task) project = task['project'] self._cnt['5m'].event((project, 'pending'), +1) self._cnt['1h'].event((project, 'pending'), +1) self._cnt['1d'].event((project, 'pending'), +1) self._cnt['all'].event((project, 'pending'), +1) logger.info('new task %(project)s:%(taskid)s %(url)s', task) return task def on_old_request(self, task, old_task): '''Called when a crawled task is arrived''' now = time.time() _schedule = task.get('schedule', self.default_schedule) old_schedule = old_task.get('schedule', {}) restart = False schedule_age = _schedule.get('age', self.default_schedule['age']) if _schedule.get('itag') and _schedule['itag'] != old_schedule.get('itag'): restart = True elif schedule_age >= 0 and schedule_age + (old_task.get('lastcrawltime', 0) or 0) < now: restart = True elif _schedule.get('force_update'): restart = True if not restart: logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task) return task['status'] = self.taskdb.ACTIVE self.update_task(task) self.put_task(task) project = task['project'] if old_task['status'] != self.taskdb.ACTIVE: self._cnt['5m'].event((project, 'pending'), +1) self._cnt['1h'].event((project, 'pending'), +1) self._cnt['1d'].event((project, 'pending'), +1) if old_task['status'] == self.taskdb.SUCCESS: self._cnt['all'].event((project, 'success'), -1).event((project, 'pending'), +1) elif old_task['status'] == self.taskdb.FAILED: self._cnt['all'].event((project, 'failed'), -1).event((project, 'pending'), +1) logger.info('restart task %(project)s:%(taskid)s %(url)s', task) return task def on_task_status(self, task): '''Called when a status pack is arrived''' try: procesok = task['track']['process']['ok'] if not self.task_queue[task['project']].done(task['taskid']): logging.error('not processing pack: %(project)s:%(taskid)s %(url)s', task) return None except KeyError as e: logger.error("Bad status pack: %s", e) return None if procesok: ret = self.on_task_done(task) else: ret = self.on_task_failed(task) if task['track']['fetch'].get('time'): self._cnt['5m_time'].event((task['project'], 'fetch_time'), task['track']['fetch']['time']) if task['track']['process'].get('time'): self._cnt['5m_time'].event((task['project'], 'process_time'), task['track']['process'].get('time')) self.projects[task['project']]['active_tasks'].appendleft((time.time(), task)) return ret def on_task_done(self, task): '''Called when a task is done and success, called by `on_task_status`''' task['status'] = self.taskdb.SUCCESS task['lastcrawltime'] = time.time() if 'schedule' in task: if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']: task['status'] = self.taskdb.ACTIVE next_exetime = task['schedule'].get('age') task['schedule']['exetime'] = time.time() + next_exetime self.put_task(task) else: del task['schedule'] self.update_task(task) project = task['project'] self._cnt['5m'].event((project, 'success'), +1) self._cnt['1h'].event((project, 'success'), +1) self._cnt['1d'].event((project, 'success'), +1) self._cnt['all'].event((project, 'success'), +1).event((project, 'pending'), -1) logger.info('task done %(project)s:%(taskid)s %(url)s', task) return task def on_task_failed(self, task): '''Called when a task is failed, called by `on_task_status`''' if 'schedule' not in task: old_task = self.taskdb.get_task(task['project'], task['taskid'], 
fields=['schedule']) if old_task is None: logging.error('unknown status pack: %s' % task) return task['schedule'] = old_task.get('schedule', {}) retries = task['schedule'].get('retries', self.default_schedule['retries']) retried = task['schedule'].get('retried', 0) project_info = self.projects.get(task['project'], {}) retry_delay = project_info.get('retry_delay', None) or self.DEFAULT_RETRY_DELAY next_exetime = retry_delay.get(retried, retry_delay.get('', self.DEFAULT_RETRY_DELAY[''])) if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']: next_exetime = min(next_exetime, task['schedule'].get('age')) else: if retried >= retries: next_exetime = -1 elif 'age' in task['schedule'] and next_exetime > task['schedule'].get('age'): next_exetime = task['schedule'].get('age') if next_exetime < 0: task['status'] = self.taskdb.FAILED task['lastcrawltime'] = time.time() self.update_task(task) project = task['project'] self._cnt['5m'].event((project, 'failed'), +1) self._cnt['1h'].event((project, 'failed'), +1) self._cnt['1d'].event((project, 'failed'), +1) self._cnt['all'].event((project, 'failed'), +1).event((project, 'pending'), -1) logger.info('task failed %(project)s:%(taskid)s %(url)s' % task) return task else: task['schedule']['retried'] = retried + 1 task['schedule']['exetime'] = time.time() + next_exetime task['lastcrawltime'] = time.time() self.update_task(task) self.put_task(task) project = task['project'] self._cnt['5m'].event((project, 'retry'), +1) self._cnt['1h'].event((project, 'retry'), +1) self._cnt['1d'].event((project, 'retry'), +1) # self._cnt['all'].event((project, 'retry'), +1) logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % ( retried, retries), task) return task def on_select_task(self, task): '''Called when a task is selected to fetch & process''' # inject informations about project logger.info('select %(project)s:%(taskid)s %(url)s', task) project_info = self.projects.get(task['project']) assert project_info, 'no such project' task['group'] = project_info.get('group') task['project_md5sum'] = project_info.get('md5sum') task['project_updatetime'] = project_info.get('updatetime', 0) project_info['active_tasks'].appendleft((time.time(), task)) self.send_task(task) return task from tornado import gen class OneScheduler(Scheduler): """ Scheduler Mixin class for one mode overwirted send_task method call processor.on_task(fetcher.fetch(task)) instead of consuming queue """ def _check_select(self): """ interactive mode of select tasks """ if not self.interactive: return super(OneScheduler, self)._check_select() # waiting for running tasks if self.running_task > 0: return is_crawled = [] def run(project=None): return crawl('on_start', project=project) def crawl(url, project=None, **kwargs): """ Crawl given url, same parameters as BaseHandler.crawl url - url or taskid, parameters will be used if in taskdb project - can be ignored if only one project exists. 
""" # looking up the project instance if project is None: if len(self.projects) == 1: project = list(self.projects.keys())[0] else: raise LookupError('You need specify the project: %r' % list(self.projects.keys())) project_data = self.processor.project_manager.get(project) if not project_data: raise LookupError('no such project: %s' % project) # get task package instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): raise Exception('url list is not allowed in interactive mode') # check task in taskdb if not kwargs: dbtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.request_task_fields) if not dbtask: dbtask = self.taskdb.get_task(task['project'], task['url'], fields=self.request_task_fields) if dbtask: task = dbtask # select the task self.on_select_task(task) is_crawled.append(True) shell.ask_exit() def quit_interactive(): '''Quit interactive mode''' is_crawled.append(True) self.interactive = False shell.ask_exit() def quit_weblocust(): '''Close weblocust''' is_crawled[:] = [] shell.ask_exit() shell = utils.get_python_console() shell.interact( 'weblocust shell - Select task\n' 'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\n' 'quit_interactive() - Quit interactive mode\n' 'quit_weblocust() - Close weblocust' ) if not is_crawled: self.ioloop.add_callback(self.ioloop.stop) def __getattr__(self, name): """patch for crawl(url, callback=self.index_page) API""" if self.interactive: return name raise AttributeError(name) def on_task_status(self, task): """Ignore not processing error in interactive mode""" if not self.interactive: super(OneScheduler, self).on_task_status(task) try: procesok = task['track']['process']['ok'] except KeyError as e: logger.error("Bad status pack: %s", e) return None if procesok: ret = self.on_task_done(task) else: ret = self.on_task_failed(task) if task['track']['fetch'].get('time'): self._cnt['5m_time'].event((task['project'], 'fetch_time'), task['track']['fetch']['time']) if task['track']['process'].get('time'): self._cnt['5m_time'].event((task['project'], 'process_time'), task['track']['process'].get('time')) self.projects[task['project']]['active_tasks'].appendleft((time.time(), task)) return ret def init_one(self, ioloop, fetcher, processor, result_worker=None, interactive=False): self.ioloop = ioloop self.fetcher = fetcher self.processor = processor self.result_worker = result_worker self.interactive = interactive self.running_task = 0 @gen.coroutine def do_task(self, task): self.running_task += 1 result = yield gen.Task(self.fetcher.fetch, task) type, task, response = result.args self.processor.on_task(task, response) # do with message while not self.processor.inqueue.empty(): _task, _response = self.processor.inqueue.get() self.processor.on_task(_task, _response) # do with results while not self.processor.result_queue.empty(): _task, _result = self.processor.result_queue.get() if self.result_worker: self.result_worker.on_result(_task, _result) self.running_task -= 1 def send_task(self, task, force=True): if self.fetcher.http_client.free_size() <= 0: if force: self._send_buffer.appendleft(task) else: raise self.outqueue.Full self.ioloop.add_future(self.do_task(task), lambda x: x.result()) def run(self): import tornado.ioloop tornado.ioloop.PeriodicCallback(self.run_once, 100, io_loop=self.ioloop).start() self.ioloop.start() def quit(self): self.ioloop.stop() logger.info("scheduler exiting...") import random import threading class 
ThreadBaseScheduler(Scheduler): def __init__(self, threads=4, *args, **kwargs): self.threads = threads self.local = threading.local() super(ThreadBaseScheduler, self).__init__(*args, **kwargs) self._taskdb = self.taskdb self._projectdb = self.projectdb self._resultdb = self.resultdb self.thread_objs = [] self.thread_queues = [] self._start_threads() assert len(self.thread_queues) > 0 @property def taskdb(self): if not hasattr(self.local, 'taskdb'): self.taskdb = self._taskdb.copy() return self.local.taskdb @taskdb.setter def taskdb(self, taskdb): self.local.taskdb = taskdb @property def projectdb(self): if not hasattr(self.local, 'projectdb'): self.projectdb = self._projectdb.copy() return self.local.projectdb @projectdb.setter def projectdb(self, projectdb): self.local.projectdb = projectdb @property def resultdb(self): if not hasattr(self.local, 'resultdb'): self.resultdb = self._resultdb.copy() return self.local.resultdb @resultdb.setter def resultdb(self, resultdb): self.local.resultdb = resultdb def _start_threads(self): for i in range(self.threads): queue = Queue.Queue() thread = threading.Thread(target=self._thread_worker, args=(queue, )) thread.daemon = True thread.start() self.thread_objs.append(thread) self.thread_queues.append(queue) def _thread_worker(self, queue): while True: method, args, kwargs = queue.get() try: method(*args, **kwargs) except Exception as e: logger.exception(e) def _run_in_thread(self, method, *args, **kwargs): i = kwargs.pop('_i', None) block = kwargs.pop('_block', False) if i is None: while True: for queue in self.thread_queues: if queue.empty(): break else: if block: time.sleep(0.1) continue else: queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)] break else: queue = self.thread_queues[i % len(self.thread_queues)] queue.put((method, args, kwargs)) if block: self._wait_thread() def _wait_thread(self): while True: if all(queue.empty() for queue in self.thread_queues): break time.sleep(0.1) def _update_project(self, project): self._run_in_thread(Scheduler._update_project, self, project) def on_task_status(self, task): i = hash(task['taskid']) self._run_in_thread(Scheduler.on_task_status, self, task, _i=i) def on_request(self, task): i = hash(task['taskid']) self._run_in_thread(Scheduler.on_request, self, task, _i=i) def _load_put_task(self, project, taskid): i = hash(taskid) self._run_in_thread(Scheduler._load_put_task, self, project, taskid, _i=i) def run_once(self): super(ThreadBaseScheduler, self).run_once() self._wait_thread() PKd6[If#weblocust/scheduler/token_bucket.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-07 16:53:08 # Modified on 2016-10-26 20:46:20 import time try: import threading as _threading except ImportError: import dummy_threading as _threading class Bucket(object): ''' traffic flow control with token bucket ''' update_interval = 30 def __init__(self, rate=1, burst=None): self.rate = float(rate) if burst is None: self.burst = float(rate) * 10 else: self.burst = float(burst) self.mutex = _threading.Lock() self.bucket = self.burst self.last_update = time.time() def get(self): '''Get the number of tokens in bucket''' now = time.time() if self.bucket >= self.burst: self.last_update = now return self.bucket bucket = self.rate * (now - self.last_update) self.mutex.acquire() if bucket > 1: self.bucket += bucket if self.bucket > self.burst: self.bucket = self.burst 
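        # Classic token-bucket refill: tokens accrued at `rate` per second
        # since last_update were just added, capped at `burst`; recording the
        # timestamp below ensures the next get() only credits newly elapsed
        # time. TaskQueue.get() dispatches a task only while get() >= 1.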
self.last_update = now
        self.mutex.release()
        return self.bucket

    def set(self, value):
        '''Set number of tokens in bucket'''
        self.bucket = value

    def desc(self, value=1):
        '''Use value tokens'''
        self.bucket -= value
PKd6[I1*LLweblocust/scheduler/__init__.pyfrom .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler  # NOQA
PKd6[IKُ!weblocust/scheduler/task_queue.py#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
# http://binux.me
#
# Contributor: qiulimao
# http://www.getqiu.com
#
# Created on 2014-02-07 13:12:10
# Modified on 2016-10-26 20:46:20

import time
import heapq
import logging
import threading

try:
    from UserDict import DictMixin
except ImportError:
    from collections import Mapping as DictMixin

from .token_bucket import Bucket
from six.moves import queue as Queue

logger = logging.getLogger('scheduler')

try:
    cmp
except NameError:
    cmp = lambda x, y: (x > y) - (x < y)


class InQueueTask(DictMixin):
    __slots__ = ('taskid', 'priority', 'exetime')
    __getitem__ = lambda *x: getattr(*x)
    __setitem__ = lambda *x: setattr(*x)
    __iter__ = lambda self: iter(self.__slots__)
    __len__ = lambda self: len(self.__slots__)
    keys = lambda self: self.__slots__

    def __init__(self, taskid, priority=0, exetime=0):
        self.taskid = taskid
        self.priority = priority
        self.exetime = exetime

    def __cmp__(self, other):
        if self.exetime == 0 and other.exetime == 0:
            return -cmp(self.priority, other.priority)
        else:
            return cmp(self.exetime, other.exetime)

    def __lt__(self, other):
        return self.__cmp__(other) < 0


class PriorityTaskQueue(Queue.Queue):
    '''
    TaskQueue

    Items with the same taskid will be merged
    '''

    def _init(self, maxsize):
        self.queue = []
        self.queue_dict = dict()

    def _qsize(self, len=len):
        return len(self.queue_dict)

    def _put(self, item, heappush=heapq.heappush):
        if item.taskid in self.queue_dict:
            task = self.queue_dict[item.taskid]
            changed = False
            if item.priority > task.priority:
                task.priority = item.priority
                changed = True
            if item.exetime < task.exetime:
                task.exetime = item.exetime
                changed = True
            if changed:
                self._resort()
        else:
            heappush(self.queue, item)
            self.queue_dict[item.taskid] = item

    def _get(self, heappop=heapq.heappop):
        while self.queue:
            item = heappop(self.queue)
            if item.taskid is None:
                continue
            self.queue_dict.pop(item.taskid, None)
            return item
        return None

    @property
    def top(self):
        while self.queue and self.queue[0].taskid is None:
            heapq.heappop(self.queue)
        if self.queue:
            return self.queue[0]
        return None

    def _resort(self):
        heapq.heapify(self.queue)

    def __contains__(self, taskid):
        return taskid in self.queue_dict

    def __getitem__(self, taskid):
        return self.queue_dict[taskid]

    def __setitem__(self, taskid, item):
        assert item.taskid == taskid
        self.put(item)

    def __delitem__(self, taskid):
        self.queue_dict.pop(taskid).taskid = None


class TaskQueue(object):
    '''
    Task queue for the scheduler: a priority queue for ready tasks plus a
    time queue for delayed ones
    '''

    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        self.mutex = threading.RLock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''
        Check time queue and processing queue

        put tasks to priority queue when execute time arrives or processing times out
''' self._check_time_queue() self._check_processing() def _check_time_queue(self): now = time.time() self.mutex.acquire() while self.time_queue.qsize() and self.time_queue.top and self.time_queue.top.exetime < now: task = self.time_queue.get_nowait() task.exetime = 0 self.priority_queue.put(task) self.mutex.release() def _check_processing(self): now = time.time() self.mutex.acquire() while self.processing.qsize() and self.processing.top and self.processing.top.exetime < now: task = self.processing.get_nowait() if task.taskid is None: continue task.exetime = 0 self.priority_queue.put(task) logger.info("processing: retry %s", task.taskid) self.mutex.release() def put(self, taskid, priority=0, exetime=0): '''Put a task into task queue''' now = time.time() task = InQueueTask(taskid, priority, exetime) self.mutex.acquire() if taskid in self.priority_queue: self.priority_queue.put(task) elif taskid in self.time_queue: self.time_queue.put(task) elif taskid in self.processing and self.processing[taskid].taskid: # force update a processing task is not allowed as there are so many # problems may happen pass else: if exetime and exetime > now: self.time_queue.put(task) else: self.priority_queue.put(task) self.mutex.release() def get(self): '''Get a task from queue when bucket available''' if self.bucket.get() < 1: return None now = time.time() self.mutex.acquire() try: task = self.priority_queue.get_nowait() self.bucket.desc() except Queue.Empty: self.mutex.release() return None task.exetime = now + self.processing_timeout self.processing.put(task) self.mutex.release() return task.taskid def done(self, taskid): '''Mark task done''' if taskid in self.processing: self.mutex.acquire() if taskid in self.processing: del self.processing[taskid] self.mutex.release() return True return False def size(self): return self.priority_queue.qsize() + self.time_queue.qsize() + self.processing.qsize() def __len__(self): return self.size() def __contains__(self, taskid): if taskid in self.priority_queue or taskid in self.time_queue: return True if taskid in self.processing and self.processing[taskid].taskid: return True return False if __name__ == '__main__': task_queue = TaskQueue() task_queue.processing_timeout = 0.1 task_queue.put('a3', 3, time.time() + 0.1) task_queue.put('a1', 1) task_queue.put('a2', 2) assert task_queue.get() == 'a2' time.sleep(0.1) task_queue._check_time_queue() assert task_queue.get() == 'a3' assert task_queue.get() == 'a1' task_queue._check_processing() assert task_queue.get() == 'a2' assert len(task_queue) == 0 PKd6[I_FF$weblocust/message_queue/beanstalk.py#!/usr/bin/env python # coding:utf-8 """beanstalk queue - queue based on beanstalk Setting: you need to set max-job-size bigger(default 65535) DAEMON_OPTS="-l $BEANSTALKD_LISTEN_ADDR -p $BEANSTALKD_LISTEN_PORT -z 524288" """ import time import umsgpack import beanstalkc import threading import logging from six.moves import queue as BaseQueue class BeanstalkQueue(object): max_timeout = 0.3 Empty = BaseQueue.Empty Full = BaseQueue.Full def __init__(self, name, host='localhost:11300', maxsize=0): """ Constructor for a BeanstalkdQueue. 
""" self.name = name config = host.split(':') self.host = config[0] if len(config) else 'localhost' self.port = int(config[1]) if len(config) > 1 else 11300 self.lock = threading.RLock() self.maxsize = maxsize self.reconnect() def stats(self): try: with self.lock: stats = self.connection.stats_tube(self.name) except beanstalkc.CommandFailed as err: # tube is empty if err[1] == 'NOT_FOUND': return {} stats = [item.split(': ') for item in stats.split('\n') if item.find(':')] stats = [(item[0], item[1]) for item in stats if len(item) == 2] return dict(stats) def reconnect(self): self.connection = beanstalkc.Connection(host=self.host, port=self.port, parse_yaml=False) self.connection.use(self.name) self.connection.watch(self.name) def qsize(self): stats = self.stats() return int(stats.get('current-jobs-ready', 0)) def empty(self): if self.qsize() == 0: return True else: return False def full(self): if self.maxsize and self.qsize() >= self.maxsize: return True else: return False def put(self, obj, block=True, timeout=None): if not block: return self.put_nowait(obj) start_time = time.time() while True: try: return self.put_nowait(obj) except BaseQueue.Full: if timeout: lasted = time.time() - start_time if timeout > lasted: time.sleep(min(self.max_timeout, timeout - lasted)) else: raise else: time.sleep(self.max_timeout) def put_nowait(self, obj): if self.full(): raise BaseQueue.Full with self.lock: return self.connection.put(umsgpack.packb(obj)) def get(self, block=True, timeout=None): if not block: return self.get_nowait() start_time = time.time() while True: try: return self.get_nowait() except BaseQueue.Empty: if timeout: lasted = time.time() - start_time if timeout > lasted: time.sleep(min(self.max_timeout, timeout - lasted)) else: raise else: time.sleep(self.max_timeout) def get_nowait(self): try: with self.lock: job = self.connection.reserve(0) if not job: raise BaseQueue.Empty else: body = umsgpack.unpackb(job.body) job.delete() return body except beanstalkc.DeadlineSoon: raise BaseQueue.Empty Queue = BeanstalkQueue PKd6[I;$k"k"#weblocust/message_queue/rabbitmq.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux<17175297.hk@gmail.com> # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-11-15 17:27:54 # Modified on 2016-10-26 20:46:20 import time import socket import select import logging import umsgpack import threading import amqp from six.moves.urllib.parse import unquote try: from urllib import parse as urlparse except ImportError: import urlparse from six.moves import queue as BaseQueue def catch_error(func): """Catch errors of rabbitmq then reconnect""" import amqp try: import pika.exceptions connect_exceptions = ( pika.exceptions.ConnectionClosed, pika.exceptions.AMQPConnectionError, ) except ImportError: connect_exceptions = () connect_exceptions += ( select.error, socket.error, amqp.ConnectionError ) def wrap(self, *args, **kwargs): try: return func(self, *args, **kwargs) except connect_exceptions as e: logging.error('RabbitMQ error: %r, reconnect.', e) self.reconnect() return func(self, *args, **kwargs) return wrap class PikaQueue(object): """ A Queue like rabbitmq connector """ Empty = BaseQueue.Empty Full = BaseQueue.Full max_timeout = 0.3 def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F', maxsize=0, lazy_limit=True): """ Constructor for a PikaQueue. Not works with python 3. Default for python 2. 
amqp_url: https://www.rabbitmq.com/uri-spec.html maxsize: an integer that sets the upperbound limit on the number of items that can be placed in the queue. lazy_limit: as rabbitmq is shared between multipul instance, for a strict limit on the number of items in the queue. PikaQueue have to update current queue size before every put operation. When `lazy_limit` is enabled, PikaQueue will check queue size every max_size / 10 put operation for better performace. """ self.name = name self.amqp_url = amqp_url self.maxsize = maxsize self.lock = threading.RLock() self.lazy_limit = lazy_limit if self.lazy_limit and self.maxsize: self.qsize_diff_limit = int(self.maxsize * 0.1) else: self.qsize_diff_limit = 0 self.qsize_diff = 0 self.reconnect() def reconnect(self): """Reconnect to rabbitmq server""" import pika import pika.exceptions self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url)) self.channel = self.connection.channel() try: self.channel.queue_declare(self.name) except pika.exceptions.ChannelClosed: self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url)) self.channel = self.connection.channel() #self.channel.queue_purge(self.name) @catch_error def qsize(self): with self.lock: ret = self.channel.queue_declare(self.name, passive=True) return ret.method.message_count def empty(self): if self.qsize() == 0: return True else: return False def full(self): if self.maxsize and self.qsize() >= self.maxsize: return True else: return False @catch_error def put(self, obj, block=True, timeout=None): if not block: return self.put_nowait() start_time = time.time() while True: try: return self.put_nowait(obj) except BaseQueue.Full: if timeout: lasted = time.time() - start_time if timeout > lasted: time.sleep(min(self.max_timeout, timeout - lasted)) else: raise else: time.sleep(self.max_timeout) @catch_error def put_nowait(self, obj): if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit: pass elif self.full(): raise BaseQueue.Full else: self.qsize_diff = 0 with self.lock: self.qsize_diff += 1 return self.channel.basic_publish("", self.name, umsgpack.packb(obj)) @catch_error def get(self, block=True, timeout=None, ack=False): if not block: return self.get_nowait() start_time = time.time() while True: try: return self.get_nowait(ack) except BaseQueue.Empty: if timeout: lasted = time.time() - start_time if timeout > lasted: time.sleep(min(self.max_timeout, timeout - lasted)) else: raise else: time.sleep(self.max_timeout) @catch_error def get_nowait(self, ack=False): with self.lock: method_frame, header_frame, body = self.channel.basic_get(self.name, not ack) if method_frame is None: raise BaseQueue.Empty if ack: self.channel.basic_ack(method_frame.delivery_tag) return umsgpack.unpackb(body) @catch_error def delete(self): with self.lock: return self.channel.queue_delete(queue=self.name) class AmqpQueue(PikaQueue): Empty = BaseQueue.Empty Full = BaseQueue.Full max_timeout = 0.3 def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F', maxsize=0, lazy_limit=True): """ Constructor for a AmqpQueue. Default for python 3. amqp_url: https://www.rabbitmq.com/uri-spec.html maxsize: an integer that sets the upperbound limit on the number of items that can be placed in the queue. lazy_limit: as rabbitmq is shared between multipul instance, for a strict limit on the number of items in the queue. PikaQueue have to update current queue size before every put operation. 
class AmqpQueue(PikaQueue):
    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
                 maxsize=0, lazy_limit=True):
        """
        Constructor for an AmqpQueue. Default for Python 3.

        amqp_url:   https://www.rabbitmq.com/uri-spec.html
        maxsize:    an integer that sets the upper bound on the number of
                    items that can be placed in the queue.
        lazy_limit: as rabbitmq is shared between multiple instances, a
                    strict limit on the number of items in the queue would
                    require the queue to fetch the current queue size before
                    every put operation. When `lazy_limit` is enabled, the
                    queue only checks the queue size about every
                    maxsize / 10 put operations, for better performance.
        """
        self.name = name
        self.amqp_url = amqp_url
        self.maxsize = maxsize
        self.lock = threading.RLock()

        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        self.qsize_diff = 0

        self.reconnect()

    def reconnect(self):
        """Reconnect to rabbitmq server"""
        parsed = urlparse.urlparse(self.amqp_url)
        port = parsed.port or 5672
        self.connection = amqp.Connection(host="%s:%s" % (parsed.hostname, port),
                                          userid=parsed.username or 'guest',
                                          password=parsed.password or 'guest',
                                          virtual_host=unquote(
                                              parsed.path.lstrip('/') or '%2F'))
        self.channel = self.connection.channel()
        try:
            self.channel.queue_declare(self.name)
        except amqp.exceptions.PreconditionFailed:
            pass
        #self.channel.queue_purge(self.name)

    @catch_error
    def qsize(self):
        with self.lock:
            name, message_count, consumer_count = self.channel.queue_declare(
                self.name, passive=True)
        return message_count

    @catch_error
    def put_nowait(self, obj):
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        with self.lock:
            self.qsize_diff += 1
            msg = amqp.Message(umsgpack.packb(obj))
            return self.channel.basic_publish(msg, exchange="", routing_key=self.name)

    @catch_error
    def get_nowait(self, ack=False):
        with self.lock:
            message = self.channel.basic_get(self.name, not ack)
            if message is None:
                raise BaseQueue.Empty
            if ack:
                self.channel.basic_ack(message.delivery_tag)
        return umsgpack.unpackb(message.body)


Queue = AmqpQueue

weblocust/message_queue/__init__.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
#         http://binux.me
#
# Contributor: qiulimao
#         http://www.getqiu.com
#
# Created on 2015-04-30 21:47:08
# Modified on 2016-10-26 20:46:20

try:
    from urllib import parse as urlparse
except ImportError:
    import urlparse


def connect_message_queue(name, url=None, maxsize=0):
    """
    Create a connection to a message queue.

    name: name of the message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    beanstalk:
        beanstalk://host:11300/
    redis:
        redis://host:6379/db
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None (a multiprocessing.Queue is used)
    """
    if not url:
        from weblocust.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize)
    elif parsed.scheme == 'beanstalk':
        from .beanstalk import Queue
        return Queue(name, host=parsed.netloc, maxsize=maxsize)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        db = parsed.path.lstrip('/').split('/')
        try:
            db = int(db[0])
        except ValueError:
            db = 0
        password = parsed.password or None
        return Queue(name, parsed.hostname, parsed.port,
                     db=db, maxsize=maxsize, password=password)
    else:
        if url.startswith('kombu+'):
            url = url[len('kombu+'):]
            from .kombu_queue import Queue
            return Queue(name, url, maxsize=maxsize)
        raise Exception('unknown connection url: %s' % url)
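# --- usage sketch (editor's addition, not part of the original file) ---
# The URL scheme selects the backend; the queue name stays the same either
# way, so components can swap transports via configuration alone. Each
# example assumes the corresponding server is actually reachable.
#
#     q = connect_message_queue('newtask_queue')           # builtin multiprocessing.Queue
#     q = connect_message_queue('status_queue',
#                               'redis://localhost:6379/0', maxsize=1000)
#     q = connect_message_queue('fetcher2processor',
#                               'amqp://guest:guest@localhost:5672/%2F')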
weblocust/message_queue/redis_queue.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
#         http://binux.me
#
# Contributor: qiulimao
#         http://www.getqiu.com
#
# Created on 2015-04-27 22:48:04
# Modified on 2016-10-26 20:46:20

import time

import redis
import umsgpack
from six.moves import queue as BaseQueue


class RedisQueue(object):
    """
    A Queue like message queue built on top of redis.
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, host='localhost', port=6379, db=0,
                 maxsize=0, lazy_limit=True, password=None):
        """
        Constructor for RedisQueue

        maxsize:    an integer that sets the upper bound on the number of
                    items that can be placed in the queue.
        lazy_limit: the redis queue is shared between instances, so a lazy
                    size limit is used for better performance.
        """
        self.name = name
        self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        self.last_qsize = 0

    def qsize(self):
        self.last_qsize = self.redis.llen(self.name)
        return self.last_qsize

    def empty(self):
        return self.qsize() == 0

    def full(self):
        return bool(self.maxsize and self.qsize() >= self.maxsize)

    def put_nowait(self, obj):
        if self.lazy_limit and self.last_qsize < self.maxsize:
            pass
        elif self.full():
            raise self.Full
        self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj))
        return True

    def put(self, obj, block=True, timeout=None):
        if not block:
            return self.put_nowait(obj)
        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except self.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    def get_nowait(self):
        ret = self.redis.lpop(self.name)
        if ret is None:
            raise self.Empty
        return umsgpack.unpackb(ret)

    def get(self, block=True, timeout=None):
        if not block:
            return self.get_nowait()
        start_time = time.time()
        while True:
            try:
                return self.get_nowait()
            except self.Empty:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)


Queue = RedisQueue
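# --- editor's note (illustration, not part of the original file) ---
# Under the hood the queue is a plain redis list: put_nowait() RPUSHes a
# msgpack blob and get_nowait() LPOPs one, giving FIFO order across any
# number of producer/consumer processes. A quick round-trip, assuming a
# redis server on localhost:6379:
#
#     q = RedisQueue('processor2result')
#     q.put({'taskid': 'demo', 'result': 42})
#     assert q.get(timeout=1) == {'taskid': 'demo', 'result': 42}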
""" self.name = name self.conn = Connection(url) self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack') self.maxsize = maxsize self.lazy_limit = lazy_limit if self.lazy_limit and self.maxsize: self.qsize_diff_limit = int(self.maxsize * 0.1) else: self.qsize_diff_limit = 0 self.qsize_diff = 0 def qsize(self): try: return self.queue.qsize() except ChannelError: return 0 def empty(self): if self.qsize() == 0: return True else: return False def full(self): if self.maxsize and self.qsize() >= self.maxsize: return True else: return False def put(self, obj, block=True, timeout=None): if not block: return self.put_nowait() start_time = time.time() while True: try: return self.put_nowait(obj) except BaseQueue.Full: if timeout: lasted = time.time() - start_time if timeout > lasted: time.sleep(min(self.max_timeout, timeout - lasted)) else: raise else: time.sleep(self.max_timeout) def put_nowait(self, obj): if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit: pass elif self.full(): raise BaseQueue.Full else: self.qsize_diff = 0 return self.queue.put(obj) def get(self, block=True, timeout=None): try: ret = self.queue.get(block, timeout) return ret.payload except self.queue.Empty: raise BaseQueue.Empty def get_nowait(self): try: ret = self.queue.get_nowait() return ret.payload except self.queue.Empty: raise BaseQueue.Empty def delete(self): self.queue.queue.delete() def __del__(self): self.queue.close() Queue = KombuQueue PKd6[I%%%weblocust/fetcher/__init__.pyfrom .tornado_fetcher import Fetcher PKd6[IxwZwZ$weblocust/fetcher/tornado_fetcher.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2012-12-17 11:07:19 # Modified on 2016-10-26 20:46:20 from __future__ import unicode_literals import six import copy import time import json import logging import functools import threading import tornado.ioloop import tornado.httputil import tornado.httpclient import weblocust from six.moves import queue, http_cookies from six.moves.urllib.robotparser import RobotFileParser from requests import cookies from six.moves.urllib.parse import urljoin, urlsplit from tornado import gen from tornado.curl_httpclient import CurlAsyncHTTPClient from tornado.simple_httpclient import SimpleAsyncHTTPClient from weblocust.libs import utils, dataurl, counter from weblocust.libs.url import quote_chinese from .cookie_utils import extract_cookies_to_jar logger = logging.getLogger('fetcher') class MyCurlAsyncHTTPClient(CurlAsyncHTTPClient): def free_size(self): return len(self._free_list) def size(self): return len(self._curls) - self.free_size() class MySimpleAsyncHTTPClient(SimpleAsyncHTTPClient): def free_size(self): return self.max_clients - self.size() def size(self): return len(self.active) fetcher_output = { "status_code": int, "orig_url": str, "url": str, "headers": dict, "content": str, "cookies": dict, } class Fetcher(object): user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36" default_options = { 'method': 'GET', 'headers': { }, 'use_gzip': True, 'timeout': 120, } phantomjs_proxy = None robot_txt_age = 60*60 # 1h def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async=True): self.inqueue = inqueue self.outqueue = outqueue self.poolsize = poolsize self._running = False self._quit = False self.proxy = proxy self.async = async 
self.ioloop = tornado.ioloop.IOLoop() self.robots_txt_cache = {} # binding io_loop to http_client here if self.async: self.http_client = MyCurlAsyncHTTPClient(max_clients=self.poolsize, io_loop=self.ioloop) else: self.http_client = tornado.httpclient.HTTPClient(MyCurlAsyncHTTPClient, max_clients=self.poolsize) self._cnt = { '5m': counter.CounterManager( lambda: counter.TimebaseAverageWindowCounter(30, 10)), '1h': counter.CounterManager( lambda: counter.TimebaseAverageWindowCounter(60, 60)), } def send_result(self, type, task, result): '''Send fetch result to processor''' if self.outqueue: try: self.outqueue.put((task, result)) except Exception as e: logger.exception(e) def fetch(self, task, callback=None): if self.async: return self.async_fetch(task, callback) else: return self.async_fetch(task, callback).result() @gen.coroutine def async_fetch(self, task, callback=None): '''Do one fetch''' url = task.get('url', 'data:,') if callback is None: callback = self.send_result type = 'None' try: if url.startswith('data:'): type = 'data' result = yield gen.maybe_future(self.data_fetch(url, task)) elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'): type = 'phantomjs' result = yield self.phantomjs_fetch(url, task) else: type = 'http' result = yield self.http_fetch(url, task) except Exception as e: logger.exception(e) result = self.handle_error(type, url, task, e) callback(type, task, result) self.on_result(type, task, result) raise gen.Return(result) def sync_fetch(self, task): '''Synchronization fetch, usually used in xmlrpc thread''' if not self._running: return self.ioloop.run_sync(functools.partial(self.async_fetch, task, lambda t, _, r: True)) wait_result = threading.Condition() _result = {} def callback(type, task, result): wait_result.acquire() _result['type'] = type _result['task'] = task _result['result'] = result wait_result.notify() wait_result.release() wait_result.acquire() self.ioloop.add_callback(self.fetch, task, callback) while 'result' not in _result: wait_result.wait() wait_result.release() return _result['result'] def data_fetch(self, url, task): '''A fake fetcher for dataurl''' self.on_fetch('data', task) result = {} result['orig_url'] = url result['content'] = dataurl.decode(url) result['headers'] = {} result['status_code'] = 200 result['url'] = url result['cookies'] = {} result['time'] = 0 result['save'] = task.get('fetch', {}).get('save') if len(result['content']) < 70: logger.info("[200] %s:%s %s 0s", task.get('project'), task.get('taskid'), url) else: logger.info( "[200] %s:%s data:,%s...[content:%d] 0s", task.get('project'), task.get('taskid'), result['content'][:70], len(result['content']) ) return result def handle_error(self, type, url, task, start_time, error): result = { 'status_code': getattr(error, 'code', 599), 'error': utils.text(error), 'content': "", 'time': time.time() - start_time, 'orig_url': url, 'url': url, "save": task.get('fetch', {}).get('save') } logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'], task.get('project'), task.get('taskid'), url, error, result['time']) return result allowed_options = ['method', 'data', 'timeout', 'cookies', 'use_gzip', 'validate_cert'] def pack_tornado_request_parameters(self, url, task): fetch = copy.deepcopy(self.default_options) fetch['url'] = url fetch['headers'] = tornado.httputil.HTTPHeaders(fetch['headers']) fetch['headers']['User-Agent'] = self.user_agent task_fetch = task.get('fetch', {}) for each in self.allowed_options: if each in task_fetch: fetch[each] = task_fetch[each] 
fetch['headers'].update(task_fetch.get('headers', {})) if task.get('track'): track_headers = tornado.httputil.HTTPHeaders( task.get('track', {}).get('fetch', {}).get('headers') or {}) track_ok = task.get('track', {}).get('process', {}).get('ok', False) else: track_headers = {} track_ok = False # proxy proxy_string = None if isinstance(task_fetch.get('proxy'), six.string_types): proxy_string = task_fetch['proxy'] elif self.proxy and task_fetch.get('proxy', True): proxy_string = self.proxy if proxy_string: if '://' not in proxy_string: proxy_string = 'http://' + proxy_string proxy_splited = urlsplit(proxy_string) if proxy_splited.username: fetch['proxy_username'] = proxy_splited.username if six.PY2: fetch['proxy_username'] = fetch['proxy_username'].encode('utf8') if proxy_splited.password: fetch['proxy_password'] = proxy_splited.password if six.PY2: fetch['proxy_password'] = fetch['proxy_password'].encode('utf8') fetch['proxy_host'] = proxy_splited.hostname.encode('utf8') if six.PY2: fetch['proxy_host'] = fetch['proxy_host'].encode('utf8') fetch['proxy_port'] = proxy_splited.port or 8080 # etag if task_fetch.get('etag', True): _t = None if isinstance(task_fetch.get('etag'), six.string_types): _t = task_fetch.get('etag') elif track_ok: _t = track_headers.get('etag') if _t and 'If-None-Match' not in fetch['headers']: fetch['headers']['If-None-Match'] = _t # last modifed if task_fetch.get('last_modified', task_fetch.get('last_modifed', True)): last_modified = task_fetch.get('last_modified', task_fetch.get('last_modifed', True)) _t = None if isinstance(last_modified, six.string_types): _t = last_modified elif track_ok: _t = track_headers.get('last-modified') if _t and 'If-Modified-Since' not in fetch['headers']: fetch['headers']['If-Modified-Since'] = _t # timeout if 'timeout' in fetch: fetch['connect_timeout'] = fetch['request_timeout'] = fetch['timeout'] del fetch['timeout'] # data rename to body if 'data' in fetch: fetch['body'] = fetch['data'] del fetch['data'] return fetch @gen.coroutine def can_fetch(self, user_agent, url): parsed = urlsplit(url) domain = parsed.netloc if domain in self.robots_txt_cache: robot_txt = self.robots_txt_cache[domain] if time.time() - robot_txt.mtime() > self.robot_txt_age: robot_txt = None else: robot_txt = None if robot_txt is None: robot_txt = RobotFileParser() try: response = yield gen.maybe_future(self.http_client.fetch( urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30)) content = response.body except tornado.httpclient.HTTPError as e: logger.error('load robots.txt from %s error: %r', domain, e) content = '' try: content = content.decode('utf8', 'ignore') except UnicodeDecodeError: content = '' robot_txt.parse(content.splitlines()) self.robots_txt_cache[domain] = robot_txt raise gen.Return(robot_txt.can_fetch(user_agent, url)) def clear_robot_txt_cache(self): now = time.time() for domain, robot_txt in self.robots_txt_cache.items(): if now - robot_txt.mtime() > self.robot_txt_age: del self.robots_txt_cache[domain] @gen.coroutine def http_fetch(self, url, task): '''HTTP fetcher''' start_time = time.time() self.on_fetch('http', task) handle_error = lambda x: self.handle_error('http', url, task, start_time, x) # setup request parameters fetch = self.pack_tornado_request_parameters(url, task) task_fetch = task.get('fetch', {}) session = cookies.RequestsCookieJar() # fix for tornado request obj if 'Cookie' in fetch['headers']: c = http_cookies.SimpleCookie() try: c.load(fetch['headers']['Cookie']) except AttributeError: 
c.load(utils.utf8(fetch['headers']['Cookie'])) for key in c: session.set(key, c[key]) del fetch['headers']['Cookie'] if 'cookies' in fetch: session.update(fetch['cookies']) del fetch['cookies'] max_redirects = task_fetch.get('max_redirects', 5) # we will handle redirects by hand to capture cookies fetch['follow_redirects'] = False # making requests while True: # robots.txt if task_fetch.get('robots_txt', False): can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'], fetch['url']) if not can_fetch: error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt') raise gen.Return(handle_error(error)) try: request = tornado.httpclient.HTTPRequest(**fetch) # if cookie already in header, get_cookie_header wouldn't work old_cookie_header = request.headers.get('Cookie') if old_cookie_header: del request.headers['Cookie'] cookie_header = cookies.get_cookie_header(session, request) if cookie_header: request.headers['Cookie'] = cookie_header elif old_cookie_header: request.headers['Cookie'] = old_cookie_header except Exception as e: logger.exception(fetch) raise gen.Return(handle_error(e)) try: response = yield gen.maybe_future(self.http_client.fetch(request)) except tornado.httpclient.HTTPError as e: if e.response: response = e.response else: raise gen.Return(handle_error(e)) extract_cookies_to_jar(session, response.request, response.headers) if (response.code in (301, 302, 303, 307) and response.headers.get('Location') and task_fetch.get('allow_redirects', True)): if max_redirects <= 0: error = tornado.httpclient.HTTPError( 599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5), response) raise gen.Return(handle_error(error)) if response.code in (302, 303): fetch['method'] = 'GET' if 'body' in fetch: del fetch['body'] fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location'])) fetch['request_timeout'] -= time.time() - start_time if fetch['request_timeout'] < 0: fetch['request_timeout'] = 0.1 fetch['connect_timeout'] = fetch['request_timeout'] max_redirects -= 1 continue result = {} result['orig_url'] = url result['content'] = response.body or '' result['headers'] = dict(response.headers) result['status_code'] = response.code result['url'] = response.effective_url or url result['time'] = time.time() - start_time result['cookies'] = session.get_dict() result['save'] = task_fetch.get('save') if response.error: result['error'] = utils.text(response.error) if 200 <= response.code < 300: logger.info("[%d] %s:%s %s %.2fs", response.code, task.get('project'), task.get('taskid'), url, result['time']) else: logger.warning("[%d] %s:%s %s %.2fs", response.code, task.get('project'), task.get('taskid'), url, result['time']) raise gen.Return(result) @gen.coroutine def phantomjs_fetch(self, url, task): '''Fetch with phantomjs proxy''' start_time = time.time() self.on_fetch('phantomjs', task) handle_error = lambda x: self.handle_error('phantomjs', url, task, start_time, x) # check phantomjs proxy is enabled if not self.phantomjs_proxy: result = { "orig_url": url, "content": "phantomjs is not enabled.", "headers": {}, "status_code": 501, "url": url, "time": time.time() - start_time, "cookies": {}, "save": task.get('fetch', {}).get('save') } logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url) raise gen.Return(result) # setup request parameters fetch = self.pack_tornado_request_parameters(url, task) task_fetch = task.get('fetch', {}) for each in task_fetch: if each not in fetch: fetch[each] = task_fetch[each] # robots.txt if 
task_fetch.get('robots_txt', False): user_agent = fetch['headers']['User-Agent'] can_fetch = yield self.can_fetch(user_agent, url) if not can_fetch: error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt') raise gen.Return(handle_error(error)) request_conf = { 'follow_redirects': False } request_conf['connect_timeout'] = fetch.get('connect_timeout', 120) request_conf['request_timeout'] = fetch.get('request_timeout', 120) session = cookies.RequestsCookieJar() request = tornado.httpclient.HTTPRequest(url=fetch['url']) if fetch.get('cookies'): session.update(fetch['cookies']) if 'Cookie' in request.headers: del request.headers['Cookie'] fetch['headers']['Cookie'] = cookies.get_cookie_header(session, request) # making requests fetch['headers'] = dict(fetch['headers']) try: request = tornado.httpclient.HTTPRequest( url="%s" % self.phantomjs_proxy, method="POST", body=json.dumps(fetch), **request_conf) except Exception as e: raise gen.Return(handle_error(e)) try: response = yield gen.maybe_future(self.http_client.fetch(request)) except tornado.httpclient.HTTPError as e: if e.response: response = e.response else: raise gen.Return(handle_error(e)) if not response.body: raise gen.Return(handle_error(Exception('no response from phantomjs'))) result = {} try: result = json.loads(utils.text(response.body)) except Exception as e: if response.error: result['error'] = utils.text(response.error) raise gen.Return(handle_error(e)) if result.get('status_code', 200): logger.info("[%d] %s:%s %s %.2fs", result['status_code'], task.get('project'), task.get('taskid'), url, result['time']) else: logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'], task.get('project'), task.get('taskid'), url, result['content'], result['time']) raise gen.Return(result) def run(self): '''Run loop''' logger.info("fetcher starting...") def queue_loop(): if not self.outqueue or not self.inqueue: return while not self._quit: try: if self.outqueue.full(): break if self.http_client.free_size() <= 0: break task = self.inqueue.get_nowait() # FIXME: decode unicode_obj should used after data selete from # database, it's used here for performance task = utils.decode_unicode_obj(task) self.fetch(task) except queue.Empty: break except KeyboardInterrupt: break except Exception as e: logger.exception(e) break tornado.ioloop.PeriodicCallback(queue_loop, 100, io_loop=self.ioloop).start() tornado.ioloop.PeriodicCallback(self.clear_robot_txt_cache, 10000, io_loop=self.ioloop).start() self._running = True try: self.ioloop.start() except KeyboardInterrupt: pass logger.info("fetcher exiting...") def quit(self): '''Quit fetcher''' self._running = False self._quit = True self.ioloop.add_callback(self.ioloop.stop) if hasattr(self, 'xmlrpc_server'): self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop) self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop) def size(self): return self.http_client.size() def xmlrpc_run(self, port=24444, bind='127.0.0.1', logRequests=False): '''Run xmlrpc server''' import umsgpack from weblocust.libs.wsgi_xmlrpc import WSGIXMLRPCApplication try: from xmlrpc.client import Binary except ImportError: from xmlrpclib import Binary application = WSGIXMLRPCApplication() application.register_function(self.quit, '_quit') application.register_function(self.size) def sync_fetch(task): result = self.sync_fetch(task) result = Binary(umsgpack.packb(result)) return result application.register_function(sync_fetch, 'fetch') def dump_counter(_time, _type): return self._cnt[_time].to_dict(_type) 
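# --- editor's note (illustration, not part of the original file) ---
# xmlrpc_run() exposes the fetcher over XML-RPC: '_quit', 'size', 'fetch'
# (the sync_fetch wrapper above, whose result is msgpack-packed into an
# XML-RPC Binary) and 'counter' (registered just below). A client could
# drive a one-off fetch like so, assuming the default bind/port:
#
#     from six.moves import xmlrpc_client
#     import umsgpack
#     rpc = xmlrpc_client.ServerProxy('http://127.0.0.1:24444', allow_none=True)
#     task = {'taskid': 't1', 'project': 'demo', 'url': 'http://example.com/'}
#     result = umsgpack.unpackb(rpc.fetch(task).data)  # .data holds the packed bytes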
application.register_function(dump_counter, 'counter') import tornado.wsgi import tornado.ioloop import tornado.httpserver container = tornado.wsgi.WSGIContainer(application) self.xmlrpc_ioloop = tornado.ioloop.IOLoop() self.xmlrpc_server = tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop) self.xmlrpc_server.listen(port=port, address=bind) self.xmlrpc_ioloop.start() def on_fetch(self, type, task): '''Called before task fetch''' pass def on_result(self, type, task, result): '''Called after task fetched''' status_code = result.get('status_code', 599) if status_code != 599: status_code = (int(status_code) / 100 * 100) self._cnt['5m'].event((task.get('project'), status_code), +1) self._cnt['1h'].event((task.get('project'), status_code), +1) if type in ('http', 'phantomjs') and result.get('time'): content_len = len(result.get('content', '')) self._cnt['5m'].event((task.get('project'), 'speed'), float(content_len) / result.get('time')) self._cnt['1h'].event((task.get('project'), 'speed'), float(content_len) / result.get('time')) self._cnt['5m'].event((task.get('project'), 'time'), result.get('time')) self._cnt['1h'].event((task.get('project'), 'time'), result.get('time')) PKl[I:Y&weblocust/fetcher/phantomjs_fetcher.js// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-10-29 22:12:14 // run this file as : // phantomjs --ssl-protocol=any --disk-cache=true /path/to/this_js_file.js 25555 var port, server, service, wait_before_end = 1000, system = require('system'), webpage = require('webpage'); if (system.args.length !== 2) { console.log('Usage: simpleserver.js '); phantom.exit(1); } else { port = system.args[1]; server = require('webserver').create(); console.debug = function(){}; service = server.listen(port, { 'keepAlive': false }, function (request, response) { phantom.clearCookies(); //console.debug(JSON.stringify(request, null, 4)); // check method if (request.method == 'GET') { body = "method not allowed!"; response.statusCode = 403; response.headers = { 'Cache': 'no-cache', 'Content-Length': body.length }; response.write(body); response.closeGracefully(); return; } var first_response = null, finished = false, page_loaded = false, start_time = Date.now(), end_time = null, script_executed = false, script_result = null; var fetch = JSON.parse(request.postRaw); console.debug(JSON.stringify(fetch, null, 2)); // create and set page var page = webpage.create(); page.onConsoleMessage = function(msg) { console.log('console: ' + msg); }; page.viewportSize = { width: fetch.js_viewport_width || 1024, height: fetch.js_viewport_height || 768*3 } if (fetch.headers) { fetch.headers['Accept-Encoding'] = undefined; fetch.headers['Connection'] = undefined; fetch.headers['Content-Length'] = undefined; } if (fetch.headers && fetch.headers['User-Agent']) { page.settings.userAgent = fetch.headers['User-Agent']; } // this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903 page.settings.loadImages = fetch.load_images === undefined ? true : fetch.load_images; page.settings.resourceTimeout = fetch.timeout ? 
fetch.timeout * 1000 : 120*1000; if (fetch.headers) { page.customHeaders = fetch.headers; } // add callbacks page.onInitialized = function() { if (!script_executed && fetch.js_script && fetch.js_run_at === "document-start") { script_executed = true; console.log('running document-start script.'); script_result = page.evaluateJavaScript(fetch.js_script); } }; page.onLoadFinished = function(status) { page_loaded = true; if (!script_executed && fetch.js_script && fetch.js_run_at !== "document-start") { script_executed = true; console.log('running document-end script.'); script_result = page.evaluateJavaScript(fetch.js_script); } console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); }; page.onResourceRequested = function(request) { console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url); end_time = null; }; page.onResourceReceived = function(response) { console.debug("Request finished: #"+response.id+" ["+response.status+"]"+response.url); if (first_response === null && response.status != 301 && response.status != 302) { first_response = response; } if (page_loaded) { console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); } } page.onResourceError = page.onResourceTimeout=function(response) { console.info("Request error: #"+response.id+" ["+response.errorCode+"="+response.errorString+"]"+response.url); if (first_response === null) { first_response = response; } if (page_loaded) { console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); } } // make sure request will finished setTimeout(function(page) { make_result(page); }, page.settings.resourceTimeout + 100, page); // send request page.open(fetch.url, { operation: fetch.method, data: fetch.data, }); // make response function make_result(page) { if (finished) { return; } if (Date.now() - start_time < page.settings.resourceTimeout) { if (!!!end_time) { return; } if (end_time > Date.now()) { setTimeout(make_result, Date.now() - end_time, page); return; } } var result = {}; try { result = _make_result(page); } catch (e) { result = { orig_url: fetch.url, status_code: 599, error: e.toString(), content: '', headers: {}, url: page.url, cookies: {}, time: (Date.now() - start_time) / 1000, save: fetch.save } } page.close(); finished = true; console.log("["+result.status_code+"] "+result.orig_url+" "+result.time) var body = JSON.stringify(result, null, 2); response.writeHead(200, { 'Cache': 'no-cache', 'Content-Type': 'application/json', }); response.write(body); response.closeGracefully(); } function _make_result(page) { if (first_response === null) { throw "No response received!"; } var cookies = {}; page.cookies.forEach(function(e) { cookies[e.name] = e.value; }); var headers = {}; if (first_response.headers) { first_response.headers.forEach(function(e) { headers[e.name] = e.value; }); } return { orig_url: fetch.url, status_code: first_response.status || 599, error: first_response.errorString, content: page.content, headers: headers, url: page.url, cookies: cookies, time: (Date.now() - start_time) / 1000, js_script_result: script_result, save: fetch.save } } }); if (service) { console.log('proxy server phantomjs running on port ' + port); } else { console.log('Error: Could not create web server listening on port ' + port); 
phantom.exit(); } } PKd6[I9|!weblocust/fetcher/cookie_utils.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-14 09:07:11 # Modified on 2016-10-26 20:46:20 from requests.cookies import MockRequest class MockResponse(object): def __init__(self, headers): self._headers = headers def info(self): return self def getheaders(self, name): """make cookie python 2 version use this method to get cookie list""" return self._headers.get_list(name) def get_all(self, name, default=[]): """make cookie python 3 version use this instead of getheaders""" return self._headers.get_list(name) or default def extract_cookies_to_jar(jar, request, response): req = MockRequest(request) res = MockResponse(response) jar.extract_cookies(res, req) PKd6[IL{ { weblocust/webui/result.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-10-19 16:23:55 # Modified on 2016-10-26 20:46:20 from __future__ import unicode_literals from flask import render_template, request, json from flask import Response from .app import app from weblocust.libs import result_dump @app.route('/results') def result(): resultdb = app.config['resultdb'] project = request.args.get('project') offset = int(request.args.get('offset', 0)) limit = int(request.args.get('limit', 20)) count = resultdb.count(project) results = list(resultdb.select(project, offset=offset, limit=limit)) return render_template( "result.html", count=count, results=results, result_formater=result_dump.result_formater, project=project, offset=offset, limit=limit, json=json ) # chrome may add / automaticly,thus causing 404 Error @app.route('/result-list///',methods=['GET', ]) @app.route('/result-list////',methods=['GET', ]) def crawleddata(project,item_per_page,page): resultdb = app.config['resultdb'] #project = request.args.get('project') #offset = int(request.args.get('offset', 0)) offset = int(item_per_page)*int(page-1) limit = int(item_per_page) count = resultdb.count(project) results = list(resultdb.select(project, offset=offset, limit=limit)) #print count,project,results reply = { "project":project, "count":count, "results":results, } #print "ccccc" return json.dumps(reply),200,{'Content-Type': 'application/json'} @app.route('/result-list/////',methods=['GET', ]) def showdata(project,item_per_page,page,refer): # refer:__self__ :不是外键,不是别人的一部分 # extraid:__main__:不是一页当中产生的多个结果之一 resultdb = app.config['resultdb'] offset = int(item_per_page)*int(page-1) limit = int(item_per_page) count = resultdb.count_by(project,condition={"refer":refer}) results = list(resultdb.select_by(project, condition={"refer":refer},offset=offset, limit=limit)) reply = { "project":project, "count":count, "results":results, } return json.dumps(reply),200,{'Content-Type': 'application/json'} @app.route('/results/dump/.<_format>') def dump_result(project, _format): resultdb = app.config['resultdb'] # force update project list resultdb.get(project, 'any') if project not in resultdb.projects: return "no such project.", 404 offset = int(request.args.get('offset', 0)) or None limit = int(request.args.get('limit', 0)) or None results = resultdb.select(project, offset=offset, limit=limit) if _format == 'json': valid = request.args.get('style', 'rows') == 'full' return Response(result_dump.dump_as_json(results, 
valid), mimetype='application/json') elif _format == 'txt': return Response(result_dump.dump_as_txt(results), mimetype='text/plain') elif _format == 'csv': return Response(result_dump.dump_as_csv(results), mimetype='text/csv') PKd6[Ivvweblocust/webui/app.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-22 23:17:13 # Modified on 2016-10-26 20:46:20 import os import sys import logging logger = logging.getLogger("webui") from six import reraise from six.moves import builtins from six.moves.urllib.parse import urljoin from flask import Flask from weblocust.fetcher import tornado_fetcher if os.name == 'nt': import mimetypes mimetypes.add_type("text/css", ".css", True) class QuitableFlask(Flask): """Add quit() method to Flask object""" @property def logger(self): return logger def run(self, host=None, port=None, debug=None, **options): import tornado.wsgi import tornado.ioloop import tornado.httpserver import tornado.web if host is None: host = '0.0.0.0' if port is None: server_name = self.config['SERVER_NAME'] if server_name and ':' in server_name: port = int(server_name.rsplit(':', 1)[1]) else: port = 5000 if debug is not None: self.debug = bool(debug) hostname = host port = port application = self use_reloader = self.debug use_debugger = self.debug if use_debugger: from werkzeug.debug import DebuggedApplication application = DebuggedApplication(application, True) try: from .webdav import dav_app except ImportError as e: logger.error('WebDav interface not enabled: %r', e) dav_app = None if dav_app: from werkzeug.wsgi import DispatcherMiddleware application = DispatcherMiddleware(application, { '/dav': dav_app }) container = tornado.wsgi.WSGIContainer(application) self.http_server = tornado.httpserver.HTTPServer(container) self.http_server.listen(port, hostname) if use_reloader: from tornado import autoreload autoreload.start() self.logger.info('webui running on %s:%s', hostname, port) self.ioloop = tornado.ioloop.IOLoop.current() self.ioloop.start() def quit(self): if hasattr(self, 'ioloop'): self.ioloop.add_callback(self.http_server.stop) self.ioloop.add_callback(self.ioloop.stop) self.logger.info('webui exiting...') app = QuitableFlask('webui', static_folder=os.path.join(os.path.dirname(__file__), 'static'), template_folder=os.path.join(os.path.dirname(__file__), 'templates')) app.secret_key = os.urandom(24) app.jinja_env.line_statement_prefix = '#' app.jinja_env.globals.update(builtins.__dict__) app.config.update({ 'fetch': lambda x: tornado_fetcher.Fetcher(None, None, async=False).fetch(x), 'taskdb': None, 'projectdb': None, 'scheduler_rpc': None, 'queues': dict(), }) def cdn_url_handler(error, endpoint, kwargs): if endpoint == 'cdn': path = kwargs.pop('path') # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/') # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') return urljoin(cdn, path) else: exc_type, exc_value, tb = sys.exc_info() if exc_value is error: reraise(exc_type, exc_value, tb) else: raise error app.handle_url_build_error = cdn_url_handler PKd6[IigԀ$$weblocust/webui/debug.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-23 00:19:06 # Modified on 2016-10-26 20:46:20 import sys import time 
import socket import inspect import datetime import traceback from flask import render_template, request, json #from flask.ext import login import flask_login as login from weblocust.libs import utils, sample_handler, dataurl from weblocust.libs.response import rebuild_response from weblocust.processor.project_module import ProjectManager, ProjectFinder from .app import app default_task = { 'taskid': 'data:,on_start', 'project': '', 'url': 'data:,on_start', 'process': { 'callback': 'on_start', }, } default_script = inspect.getsource(sample_handler) @app.route("/debug/clear/taskdb/",methods=['GET','POST']) def clear_taskdb(project): """ clear all the tasks in taskdb """ taskdb = app.config['taskdb'] resultdb = app.config['resultdb'] taskdb.remove(project) return json.dumps({"taskdbsize":taskdb.size(project),"resultdbsize":resultdb.size(project)}),200,{'Content-Type':"application/json"} @app.route("/debug/clear/resultdb/",methods=['GET','POST']) def clear_resultdb(project): """ clear all the results in resultdb """ taskdb = app.config['taskdb'] resultdb = app.config['resultdb'] resultdb.remove(project) return json.dumps({"taskdbsize":taskdb.size(project),"resultdbsize":resultdb.size(project)}),200,{'Content-Type':"application/json"} def get_project_info(project): """ """ taskdb = app.config['taskdb'] resultdb = app.config['resultdb'] result = {'taskdbsize':taskdb.size(project),'resultdbsize':resultdb.size(project)} return json.dumps(result),200,{'Content-Type':'application/json'} @app.route("/debug/info/",methods=['GET']) def project_info(project): """ project infomation """ taskdb = app.config['taskdb'] resultdb = app.config['resultdb'] result = {'taskdbsize':taskdb.size(project),'resultdbsize':resultdb.size(project)} return json.dumps(result),200,{'Content-Type':'application/json'} @app.route('/debug/', methods=['GET', 'POST']) def debug(project): """ project-name:news_163_com start-urls:http://news.163.com script-mode:script """ projectdb = app.config['projectdb'] if not projectdb.verify_project_name(project): return 'project name is not allowed!', 400 info = projectdb.get(project, fields=['name', 'script']) if info: script = info['script'] else: script = (default_script .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) .replace('__PROJECT_NAME__', project) .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__')) taskid = request.args.get('taskid') if taskid: taskdb = app.config['taskdb'] task = taskdb.get_task( project, taskid, ['taskid', 'project', 'url', 'fetch', 'process']) else: task = default_task default_task['project'] = project return render_template("debug.html", task=task, script=script, project_name=project) @app.route("/debug/create-project",methods=["POST"]) def create_project(): project_name = request.form.get("project-name") #debug(project_name) return json.dumps({"ok":1,"project_name":project_name}),200,{'Content-Type': 'application/json'} @app.before_first_request def enable_projects_import(): sys.meta_path.append(ProjectFinder(app.config['projectdb'])) @app.route('/debug//run', methods=['POST', ]) def run(project): start_time = time.time() try: task = utils.decode_unicode_obj(json.loads(request.form['task'])) except Exception: result = { 'fetch_result': "", 'logs': u'task json error', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info = { 'name': project, 'status': 'DEBUG', 'script': 
request.form['script'], } if request.form.get('webdav_mode') == 'true': projectdb = app.config['projectdb'] info = projectdb.get(project, fields=['name', 'script']) if not info: result = { 'fetch_result': "", 'logs': u' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info['script'] = info['script'] fetch_result = {} try: fetch_result = app.config['fetch'](task) response = rebuild_response(fetch_result) module = ProjectManager.build_module(project_info, { 'debugger': True }) ret = module['instance'].run_task(module['module'], task, response) except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': fetch_result, 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } else: result = { 'fetch_result': fetch_result, 'logs': ret.logstr(), 'follows': ret.follows, 'messages': ret.messages, 'result': ret.result, 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text if (response.headers.get('content-type', '').startswith('image')): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) try: # binary data can't encode to JSON, encode result as unicode obj # before send it to frontend return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': "", 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} @app.route('/debug//save', methods=['POST', ]) def save(project): projectdb = app.config['projectdb'] if not projectdb.verify_project_name(project): return 'project name is not allowed!', 400 script = request.form['script'] project_info = projectdb.get(project, fields=['name', 'status', 'group']) if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): return app.login_response if project_info: # 做更新 info = { 'script': script, } if project_info.get('status') in ('DEBUG', 'RUNNING', ): info['status'] = 'CHECKING' projectdb.update(project, info) else: # 创建 info = { 'name': project, 'script': script, 'status': 'TODO', 'rate': app.config.get('max_rate', 1), 'burst': app.config.get('max_burst', 3), } projectdb.insert(project, info) ## # we need to ensure_index when new project created ## taskdb = app.config["taskdb"] resultdb = app.config['resultdb'] taskdb.ensure_index(project) resultdb.ensure_index(project) rpc = app.config['scheduler_rpc'] if rpc is not None: try: rpc.update_project() except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return 'rpc error', 200 return 'ok', 200 @app.route('/debug//get') def get_script(project): projectdb = app.config['projectdb'] if not projectdb.verify_project_name(project): return 'project name is not allowed!', 400 info = projectdb.get(project, fields=['name', 'script']) return json.dumps(utils.unicode_obj(info)), \ 200, {'Content-Type': 'application/json'} @app.route('/helper.js') def resizer_js(): host = request.headers['Host'] return 
render_template("helper.js", host=host), 200, {'Content-Type': 'application/javascript'} @app.route('/helper.html') def resizer_html(): height = request.args.get('height') script = request.args.get('script', '') return render_template("helper.html", height=height, script=script) PKd6[Ieweblocust/webui/index.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-22 23:20:39 # Modified on 2016-10-26 20:46:20 import socket from flask import render_template, request, json #from flask.ext import login import flask_login as login from .app import app index_fields = ['name', 'group', 'status', 'comments', 'rate', 'burst', 'updatetime'] @app.route('/previews-index') def index(): projectdb = app.config['projectdb'] projects = sorted(projectdb.get_all(fields=index_fields), key=lambda k: (0 if k['group'] else 1, k['group'], k['name'])) return render_template("index.html", projects=projects) ## # ## ## # add by qiulimao @2016.05 ## @app.route('/projects-list') def projectslist(): projectdb = app.config['projectdb'] projects = sorted(projectdb.get_all(fields=index_fields), key=lambda k: (0 if k['group'] else 1, k['group'], k['name'])) return json.dumps({"projects":projects}),200,{'Content-Type': 'application/json'} #return render_template("index.html", projects=projects) @app.route("/") @app.route("/angular") def angular_index(): return render_template("angularindex.html") @app.route('/queues') def get_queues(): def try_get_qsize(queue): if queue is None: return 'None' try: return queue.qsize() except Exception as e: return "%r" % e result = {} queues = app.config.get('queues', {}) for key in queues: result[key] = try_get_qsize(queues[key]) return json.dumps(result), 200, {'Content-Type': 'application/json'} @app.route('/update', methods=['POST', ]) def project_update(): projectdb = app.config['projectdb'] project = request.form['pk'] name = request.form['name'] value = request.form['value'] project_info = projectdb.get(project, fields=('name', 'group')) if not project_info: return "no such project.", 404 if 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): return app.login_response if name not in ('group', 'status', 'rate'): return 'unknown field: %s' % name, 400 if name == 'rate': value = value.split('/') if len(value) != 2: return 'format error: rate/burst', 400 rate = float(value[0]) burst = float(value[1]) update = { 'rate': min(rate, app.config.get('max_rate', rate)), 'burst': min(burst, app.config.get('max_burst', burst)), } else: update = { name: value } ret = projectdb.update(project, update) if ret: rpc = app.config['scheduler_rpc'] if rpc is not None: try: rpc.update_project() except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return 'rpc error', 200 return 'ok', 200 else: return 'update error', 500 @app.route('/counter') def counter(): rpc = app.config['scheduler_rpc'] if rpc is None: return json.dumps({}) result = {} try: for project, counter in rpc.counter('5m_time', 'avg').items(): result.setdefault(project, {})['5m_time'] = counter for project, counter in rpc.counter('5m', 'sum').items(): result.setdefault(project, {})['5m'] = counter for project, counter in rpc.counter('1h', 'sum').items(): result.setdefault(project, {})['1h'] = counter for project, counter in rpc.counter('1d', 'sum').items(): result.setdefault(project, {})['1d'] = counter for project, 
counter in rpc.counter('all', 'sum').items(): result.setdefault(project, {})['all'] = counter except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return json.dumps({}), 200, {'Content-Type': 'application/json'} return json.dumps(result), 200, {'Content-Type': 'application/json'} @app.route('/run', methods=['POST', ]) def runtask(): rpc = app.config['scheduler_rpc'] if rpc is None: return json.dumps({}) projectdb = app.config['projectdb'] project = request.form['project'] project_info = projectdb.get(project, fields=('name', 'group')) if not project_info: return "no such project.", 404 if 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): return app.login_response newtask = { "project": project, "taskid": "on_start", "url": "data:,on_start", "process": { "callback": "on_start", }, "schedule": { "age": 0, "priority": 9, "force_update": True, }, } try: ret = rpc.newtask(newtask) except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return json.dumps({"result": False}), 200, {'Content-Type': 'application/json'} return json.dumps({"result": ret}), 200, {'Content-Type': 'application/json'} @app.route('/robots.txt') def robots(): return """User-agent: * Disallow: / Allow: /$ Allow: /debug Disallow: /debug/*?taskid=* """, 200, {'Content-Type': 'text/plain'} PKd6[I> weblocust/webui/bench_test.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-08 22:31:17 # Modified on 2016-10-26 20:46:20 import random try: from urllib import urlencode except ImportError: from urllib.parse import urlencode from flask import request from .app import app @app.route('/bench') def bench_test(): total = int(request.args.get('total', 10000)) show = int(request.args.get('show', 20)) nlist = [random.randint(1, total) for _ in range(show)] result = [] result.append("") args = dict(request.args) for nl in nlist: args['n'] = nl argstr = urlencode(sorted(args.items()), doseq=True) result.append("follow {1}
".format(argstr, nl)) result.append("") return "".join(result) PKd6[I( pkkweblocust/webui/webdav.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2015-6-3 11:29 # Modified on 2016-10-26 20:46:20 import os import re import time import base64 from six import BytesIO from wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp from wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection from wsgidav.dav_error import DAVError, HTTP_NOT_FOUND, HTTP_FORBIDDEN from weblocust.libs.utils import utf8, text from .app import app class ContentIO(BytesIO): def close(self): self.content = self.getvalue() BytesIO.close(self) class ScriptResource(DAVNonCollection): def __init__(self, path, environ, app, project=None): super(ScriptResource, self).__init__(path, environ) self.app = app self.new_project = False self._project = project self.project_name = self.name self.writebuffer = None if self.project_name.endswith('.py'): self.project_name = self.project_name[:-len('.py')] @property def project(self): if self._project: return self._project projectdb = self.app.config['projectdb'] if projectdb: self._project = projectdb.get(self.project_name) if not self._project: if projectdb.verify_project_name(self.project_name) and self.name.endswith('.py'): self.new_project = True self._project = { 'name': self.project_name, 'script': '', 'status': 'TODO', 'rate': self.app.config.get('max_rate', 1), 'burst': self.app.config.get('max_burst', 3), 'updatetime': time.time(), } else: raise DAVError(HTTP_FORBIDDEN) return self._project @property def readonly(self): projectdb = self.app.config['projectdb'] if not projectdb: return True if 'lock' in projectdb.split_group(self.project.get('group')) \ and self.app.config.get('webui_username') \ and self.app.config.get('webui_password'): authheader = self.environ.get("HTTP_AUTHORIZATION") if not authheader: return True authheader = authheader[len("Basic "):] try: username, password = text(base64.b64decode(authheader)).split(':', 1) except Exception as e: self.app.logger.error('wrong api key: %r, %r', authheader, e) return True if username == self.app.config['webui_username'] \ and password == self.app.config['webui_password']: return False else: return True return False def getContentLength(self): return len(utf8(self.project['script'])) def getContentType(self): return 'text/plain' def getLastModified(self): return self.project['updatetime'] def getContent(self): return BytesIO(utf8(self.project['script'])) def beginWrite(self, contentType=None): if self.readonly: self.app.logger.error('webdav.beginWrite readonly') return super(ScriptResource, self).beginWrite(contentType) self.writebuffer = ContentIO() return self.writebuffer def endWrite(self, withErrors): if withErrors: self.app.logger.error('webdav.endWrite error: %r', withErrors) return super(ScriptResource, self).endWrite(withErrors) if not self.writebuffer: return projectdb = self.app.config['projectdb'] if not projectdb: return info = { 'script': text(getattr(self.writebuffer, 'content', '')) } if self.project.get('status') in ('DEBUG', 'RUNNING'): info['status'] = 'CHECKING' if self.new_project: self.project.update(info) self.new_project = False return projectdb.insert(self.project_name, self.project) else: return projectdb.update(self.project_name, info) class RootCollection(DAVCollection): def __init__(self, path, environ, app): super(RootCollection, 
self).__init__(path, environ) self.app = app self.projectdb = self.app.config['projectdb'] def getMemberList(self): members = [] for project in self.projectdb.get_all(): project_name = utf8(project['name']) if not project_name.endswith('.py'): project_name += '.py' members.append(ScriptResource( os.path.join(self.path, project_name), self.environ, self.app, project )) return members def getMemberNames(self): members = [] for project in self.projectdb.get_all(fields=['name', ]): project_name = utf8(project['name']) if not project_name.endswith('.py'): project_name += '.py' members.append(project_name) return members class ScriptProvider(DAVProvider): def __init__(self, app): super(ScriptProvider, self).__init__() self.app = app def __repr__(self): return "weblocustScriptProvider" def getResourceInst(self, path, environ): path = os.path.normpath(path).replace('\\', '/') if path in ('/', '.', ''): path = '/' return RootCollection(path, environ, self.app) else: return ScriptResource(path, environ, self.app) config = DEFAULT_CONFIG.copy() config.update({ 'mount_path': '/dav', 'provider_mapping': { '/': ScriptProvider(app) }, 'user_mapping': {}, 'verbose': 1 if app.debug else 0, 'dir_browser': {'davmount': False, 'enable': True, 'msmount': False, 'response_trailer': ''}, }) dav_app = WsgiDAVApp(config) PKd6[ICweblocust/webui/login.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-12-10 20:36:27 # Modified on 2016-10-26 20:46:20 import base64 from flask import Response #from flask.ext import login import flask_login as login from .app import app login_manager = login.LoginManager() login_manager.init_app(app) class AnonymousUser(login.AnonymousUserMixin): def is_anonymous(self): return True def is_active(self): return False def is_authenticated(self): return False def get_id(self): return class User(login.UserMixin): def __init__(self, id, password): self.id = id self.password = password def is_authenticated(self): if not app.config.get('webui_username'): return True if self.id == app.config.get('webui_username') \ and self.password == app.config.get('webui_password'): return True return False def is_active(self): return self.is_authenticated() login_manager.anonymous_user = AnonymousUser @login_manager.request_loader def load_user_from_request(request): api_key = request.headers.get('Authorization') if api_key: api_key = api_key[len("Basic "):] try: api_key = base64.b64decode(api_key).decode('utf8') return User(*api_key.split(":", 1)) except Exception as e: app.logger.error('wrong api key: %r, %r', api_key, e) return None return None app.login_response = Response( "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} ) @app.before_request def before_request(): if app.config.get('need_auth', False): if not login.current_user.is_active(): return app.login_response PKd6[I+aaweblocust/webui/__init__.py#!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # # Contributor: qiulimao # http://www.getqiu.com # # Created on 2014-02-22 23:20:40 # Modified on 2016-10-26 20:46:20 from . 
import app, index, debug, task, result, login
PKd6[Iwweblocust/webui/task.py#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
# http://binux.me
#
# Contributor: qiulimao
# http://www.getqiu.com
#
# Created on 2014-07-16 15:30:57
# Modified on 2016-10-26 20:46:20

import socket
from flask import abort, render_template, request, json
from weblocust.libs import utils
from .app import app


@app.route('/task/<taskid>')
def task(taskid):
    """ a specific task """
    if ':' not in taskid:
        abort(400)
    project, taskid = taskid.split(':', 1)

    taskdb = app.config['taskdb']
    task = taskdb.get_task(project, taskid)
    if not task:
        abort(404)
    result = None  # default when no resultdb is configured
    resultdb = app.config['resultdb']
    if resultdb:
        result = resultdb.get(project, taskid)

    return render_template("task.html", task=task, json=json, result=result,
                           status_to_string=app.config['taskdb'].status_to_string)


@app.route('/tasks')
def tasks():
    """ a group of tasks """
    rpc = app.config['scheduler_rpc']
    taskdb = app.config['taskdb']
    project = request.args.get('project', "")
    limit = int(request.args.get('limit', 100))

    try:
        updatetime_tasks = rpc.get_active_tasks(project, limit)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return 'connect to scheduler error', 502

    # collapse duplicate snapshots of the same task, keeping superseded
    # non-ACTIVE snapshots as history entries
    tasks = {}
    result = []
    for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]):
        key = '%(project)s:%(taskid)s' % task
        task['updatetime'] = updatetime
        if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE:
            result.append(tasks[key])
        tasks[key] = task
    result.extend(tasks.values())

    return render_template(
        "tasks.html",
        tasks=result,
        status_to_string=taskdb.status_to_string
    )


@app.route('/tasks/<project>')
def project_tasks(project):
    """ a group of tasks """
    rpc = app.config['scheduler_rpc']
    taskdb = app.config['taskdb']
    #project = request.args.get('project', "")
    limit = int(request.args.get('limit', 100))

    try:
        updatetime_tasks = rpc.get_active_tasks(project, limit)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return 'connect to scheduler error', 502

    tasks = {}
    result = []
    for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]):
        key = '%(project)s:%(taskid)s' % task
        task['updatetime'] = updatetime
        if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE:
            result.append(tasks[key])
        tasks[key] = task
    result.extend(tasks.values())

    return json.dumps({"tasks": result}), 200, \
        {'Content-Type': 'application/json'}
    #return render_template(
    #    "tasks.html",
    #    tasks=result,
    #    status_to_string=taskdb.status_to_string
    #)


@app.route('/active_tasks')
def active_tasks():
    rpc = app.config['scheduler_rpc']
    taskdb = app.config['taskdb']
    project = request.args.get('project', "")
    limit = int(request.args.get('limit', 100))

    try:
        tasks = rpc.get_active_tasks(project, limit)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return '{}', 502, {'Content-Type': 'application/json'}

    result = []
    for updatetime, task in tasks:
        task['updatetime'] = updatetime
        task['updatetime_text'] = utils.format_date(updatetime)
        if 'status' in task:
            task['status_text'] = taskdb.status_to_string(task['status'])
        result.append(task)

    return json.dumps(result), 200, {'Content-Type': 'application/json'}


app.template_filter('format_date')(utils.format_date)
PKd6[IA!weblocust/webui/static/result.css/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-10-22 22:38:45 */ /* vim: set et sw=2 ts=2 sts=2 ff=unix
fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:18:30 */ .top-bar { padding: 10px 15px 2px 15px; height: 46px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; position: relative; } .top-bar h1 { margin: 0 0 10px 0; font-size: 18px; } .top-bar .btn-group { margin: 8px 10px 0 0; position: absolute; right: 0; top: 0; } .pagination-wrap { text-align: right; padding-right: 15px; } table { border-bottom: 1px solid #ddd; } table td { word-break: break-all; } PKd6[IiZ~# weblocust/webui/static/task.less/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:20:30 */ @import "variable"; .base-info { padding: 10px 15px 2px 15px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; } .more-info { padding: 10px 15px; } .more-info dd { display: block; font-family: monospace; white-space: pre; word-break: break-all; word-wrap: break-word; margin: 1em 0px; } .status_mix(@color: lighten(black, 50%)) { border: solid 1px darken(@color, 10%); padding: 1px 5px 0 5px; background: @color; color: white; } .status { &-1 { .status_mix(@blue); } &-2 { .status_mix(@green); } &-3 { .status_mix(@red); } &-4 { .status_mix; } } .url { font-size: 120%; text-decoration: underline; } .callback { color: @orange; font-weight: bold; &:hover, &:focus { color: darken(@orange, 10%); } } dt .glyphicon-ok { color: @green; } dt .glyphicon-remove { color: @red; } PKd6[I,,!weblocust/webui/static/tasks.less/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-18 23:20:46 */ @import "variable"; @import "task"; .tasks { margin: 0; padding: 0; list-style-type: none; li { .base-info; &:nth-child(even) { background-color: white; } } .url { display: inline-block; vertical-align: bottom; max-width: 40em; overflow: hidden; white-space: nowrap; text-overflow: ellipsis; } .update-time { font-weight: bold; } } PKd6[IUF"weblocust/webui/static/result.less/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-10-22 22:38:45 */ @import "variable"; .top-bar { padding: 10px 15px 2px 15px; height: 46px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; position: relative; h1 { margin: 0 0 10px 0; font-size: 18px; } .btn-group { margin: 8px 10px 0 0; position: absolute; right: 0; top: 0; a.btn { } } } .pagination-wrap { text-align: right; padding-right: 15px; } table { border-bottom: 1px solid #ddd; td { word-break: break-all; } } PKd6[Is`j#!weblocust/webui/static/index.less/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-02-23 00:28:30 */ @import "variable"; h1 { margin-top: 5px; } header .alert { position: absolute;; width: 50rem; left: 50%; margin-left: -25rem; } .queue-info { th, td { text-align: center; border: 1px solid #ddd; } } .projects { min-width: 850px; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd; .project-group { width: 80px; } .project-name { font-weight: bold; } .project-status { width: 100px; } .project-status-span(@color) { border: solid 1px darken(@color, 10%); padding: 1px 5px 0 5px; background: @color; color: white; } .project-status>span { .project-status-span(lighten(black, 50%)); } span.status-TODO { .project-status-span(@orange); } span.status-STOP { .project-status-span(@red); } span.status-CHECKING { .project-status-span(darken(@yellow, 10%)); } span.status-DEBUG { .project-status-span(@blue); } 
span.status-RUNNING { .project-status-span(@green); } .project-rate { width: 110px; } .project-time { width: 110px; } th.project-progress { position: relative; span { position: absolute; } } td.project-progress { position: relative; min-width: 5%; &.progress-all { min-width: 10%; } .progress { position: relative; margin: 0; background-color: #aaa; .progress-text { width: 100%; text-align: center; position: absolute; font-weight: bold; color: #fff; pointer-events: none; } .progress-bar { -webkit-transition: none; transition: none; } } } .project-actions { width: 200px; } } .global-btn { margin-top: -5px; padding: 10px 10px 10px 10px; .create-btn-div { float: right; } .active-btn-div { float: left; } } PKd6[IKDD weblocust/webui/static/tasks.css/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-18 23:20:46 */ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:18:30 */ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:20:30 */ .base-info { padding: 10px 15px 2px 15px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; } .more-info { padding: 10px 15px; } .more-info dd { display: block; font-family: monospace; white-space: pre; word-break: break-all; word-wrap: break-word; margin: 1em 0px; } .status-1 { border: solid 1px #3071a9; padding: 1px 5px 0 5px; background: #428bca; color: white; } .status-2 { border: solid 1px #449d44; padding: 1px 5px 0 5px; background: #5cb85c; color: white; } .status-3 { border: solid 1px #c9302c; padding: 1px 5px 0 5px; background: #d9534f; color: white; } .status-4 { border: solid 1px #666666; padding: 1px 5px 0 5px; background: #808080; color: white; } .url { font-size: 120%; text-decoration: underline; } .callback { color: #f0ad4e; font-weight: bold; } .callback:hover, .callback:focus { color: #ec971f; } dt .glyphicon-ok { color: #5cb85c; } dt .glyphicon-remove { color: #d9534f; } .tasks { margin: 0; padding: 0; list-style-type: none; } .tasks li { padding: 10px 15px 2px 15px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; } .tasks li:nth-child(even) { background-color: white; } .tasks .url { display: inline-block; vertical-align: bottom; max-width: 40em; overflow: hidden; white-space: nowrap; text-overflow: ellipsis; } .tasks .update-time { font-weight: bold; } PKd6[IH{MMweblocust/webui/static/debug.js// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-02-23 15:19:19 window.SelectorHelper = (function() { var helper = $('#css-selector-helper'); function merge_name(p) { var features = p.features; var element_name = ''; features.forEach(function(f) { if (f.selected) element_name += f.name; }); if (element_name === '') { return p.tag; } return element_name; } function merge_pattern(path, end) { var pattern = ''; var prev = null; path.forEach(function(p, i) { if (end >= 0 && i > end) { return; } if (p.invalid) { prev = null; } else if (p.selected) { if (prev) { pattern += ' >'; } var element_pattern = ''; p.features.forEach(function(f) { if (f.selected) { element_pattern += f.pattern; } }); if (element_pattern === '') { element_pattern = '*'; } pattern += ' '+element_pattern; prev = p; } else { prev = null; } }) if (pattern === '') { pattern = '*'; } return pattern.trim(); } function selector_changed(path) { $("#tab-web iframe").get(0).contentWindow.postMessage({ type: "heightlight", 
css_selector: merge_pattern(path), }, '*'); }

  var current_path = null;
  function render_selector_helper(path) {
    helper.find('.element').remove();
    var elements = [];
    $.each(path, function(i, p) {
      var span = $('<span>').addClass('element').data('info', p);
      $('<span class="element-name">').text(p.name).appendTo(span);
      if (p.selected) span.addClass('selected');
      if (p.invalid) span.addClass('invalid');
      var ul = $('<ul>');
      $.each(p.features, function(i, f) {
        var li = $('<li>').text(f.name).data('feature', f);
        if (f.selected) li.addClass('selected');
        li.appendTo(ul);
        // feature on click
        li.on('click', function(ev) {
          ev.stopPropagation();
          var $this = $(this);
          var f = $this.data('feature');
          if (f.selected) {
            f.selected = false;
            $this.removeClass('selected');
          } else {
            f.selected = true;
            $this.addClass('selected');
          }
          var element = $this.parents('.element');
          if (!p.selected) {
            p.selected = true;
            element.addClass('selected');
          }
          element.find('.element-name').text(merge_name(p));
          selector_changed(path);
        });
      });
      ul.appendTo(span);
      span.on('mouseover', function(ev) {
        var xpath = [];
        $.each(path, function(i, _p) {
          xpath.push(_p.xpath);
          if (_p === p) {
            return false;
          }
        });
        $("#tab-web iframe")[0].contentWindow.postMessage({
          type: 'overlay',
          xpath: '/' + xpath.join('/'),
        }, '*');
      })
      // path on click
      span.on('click', function(ev) {
        ev.stopPropagation();
        var $this = $(this);
        var p = $this.data('info');
        if (p.selected) {
          p.selected = false;
          $this.removeClass('selected');
        } else {
          p.selected = true;
          $this.addClass('selected');
        }
        $this.find('.element-name').text(merge_name($this.data('info')));
        selector_changed(path);
      });
      elements.push(span);
    });
    helper.prepend(elements);
    adjustHelper();
    selector_changed(path);
  }

  function adjustHelper() {
    while (helper[0].scrollWidth > helper.width()) {
      var e = helper.find('.element:visible:first');
      if (e.length == 0) {
        return;
      }
      e.addClass('invalid').data('info')['invalid'] = true;
    }
  }

  var tab_web = $('#tab-web');
  return {
    init: function() {
      var _this = this;
      _this.clear();
      window.addEventListener("message", function(ev) {
        if (ev.data.type == "selector_helper_click") {
          console.log(ev.data.path);
          render_selector_helper(ev.data.path);
          current_path = ev.data.path;
        }
      });
      $("#J-enable-css-selector-helper").on('click', function() {
        _this.clear();
        $("#tab-web iframe")[0].contentWindow.postMessage({
          type: 'enable_css_selector_helper'
        }, '*');
        _this.enable();
      });
      $("#task-panel").on("scroll", function(ev) {
        if (!helper.is(':visible')) {
          return;
        }
        if ($("#debug-tabs").position().top < 0) {
          helper.addClass('fixed');
          tab_web.addClass('fixed');
        } else {
          helper.removeClass('fixed');
          tab_web.removeClass('fixed');
        }
      });
      // copy button
      var input = helper.find('.copy-selector-input');
      input.on('focus', function(ev) {
        $(this).select();
      });
      helper.find('.copy-selector').on('click', function(ev) {
        if (!current_path) {
          return;
        }
        if (input.is(':visible')) {
          input.hide();
          helper.find('.element').show();
        } else {
          helper.find('.element').hide();
          input.val(merge_pattern(current_path)).show();
        }
      });
      // add button
      helper.find('.add-to-editor').on('click', function(ev) {
        Debugger.python_editor_replace_selection(merge_pattern(current_path));
      });
    },
    clear: function() {
      current_path = null;
      helper.hide();
      helper.removeClass('fixed');
      tab_web.removeClass('fixed');
      helper.find('.element').remove();
    },
    enable: function() {
      helper.show();
      helper.find('.copy-selector-input').hide();
      if ($("#debug-tabs").position().top < 0) {
        helper.addClass('fixed');
        tab_web.addClass('fixed');
      } else {
        helper.removeClass('fixed');
        tab_web.removeClass('fixed');
      }
    },
  }
})();

window.Debugger = (function() {
  var tmp_div = $('<div>');
  function escape(text) {
    return tmp_div.text(text).html();
  }

  window.addEventListener("message", function(ev) {
    if (ev.data.type == "resize") {
      $("#tab-web iframe").height(ev.data.height+60);
    }
  });

  return {
    init: function() {
      //init resizer
      this.splitter = $(".debug-panel:not(:first)").splitter().data('splitter')
        .trigger('init')
        .on('resize-start', function() {
          $('#left-area .overlay').show();
        })
        .on('resize-end', function() {
          $('#left-area .overlay').hide();
        });

      //codemirror
      CodeMirror.keyMap.basic.Tab = 'indentMore';
      this.init_python_editor($("#python-editor"));
      this.init_task_editor($("#task-editor"));
      this.bind_debug_tabs();
      this.bind_run();
      this.bind_save();
      this.bind_others();

      // css selector helper
      SelectorHelper.init();
    },
    not_saved: false,
    init_python_editor: function($el) {
      var _this = this;
      this.python_editor_elem = $el;
      var cm = this.python_editor = CodeMirror($el[0], {
        value: script_content,
        mode: "python",
        indentUnit: 4,
        lineWrapping: true,
        styleActiveLine: true,
        autofocus: true
      });
      cm.on('focus', function() {
        $el.addClass("focus");
      });
      cm.on('blur', function() {
        $el.removeClass("focus");
      });
      cm.on('change', function() {
        _this.not_saved = true;
      });
      window.addEventListener('beforeunload', function(e) {
        if (_this.not_saved) {
          var returnValue = "You have not saved changes.";
          (e || window.event).returnValue = returnValue;
          return returnValue;
        }
      });
    },
    python_editor_replace_selection: function(content) {
      this.python_editor.getDoc().replaceSelection(content);
    },
    auto_format: function(cm) {
      var pos = cm.getCursor(true);
      CodeMirror.commands.selectAll(cm);
      cm.autoFormatRange(cm.getCursor(true), cm.getCursor(false));
      cm.setCursor(pos);
    },
    format_string: function(value, mode) {
      var div = document.createElement('div');
      var cm = CodeMirror(div, {
        value: value,
        mode: mode
      });
      this.auto_format(cm);
      return cm.getDoc().getValue();
    },
    init_task_editor: function($el) {
      var cm = this.task_editor = CodeMirror($el[0], {
        value: task_content,
        mode: "application/json",
        indentUnit: 2,
        lineWrapping: true,
        styleActiveLine: true
      });
      this.auto_format(cm);
      cm.getDoc().clearHistory();
      cm.on('focus', function() {
        $el.addClass("focus");
      });
      cm.on('blur', function() {
        $el.removeClass("focus");
      });
    },
    bind_debug_tabs: function() {
      var _this = this;
      $('#tab-control > li[data-id]').on('click', function() {
        $('#tab-control > li[data-id]').removeClass('active');
        var name = $(this).addClass('active').data('id');
        $('#debug-tabs .tab').hide();
        $('#debug-tabs #'+name).show();
      });
      $("#tab-control li[data-id=tab-html]").on('click', function() {
        if (!!!$("#tab-html").data("format")) {
          var html_styled = "";
          CodeMirror.runMode(_this.format_string($("#tab-html pre").text(), 'text/html'), 'text/html',
                             function(text, classname) {
                               if (classname)
                                 html_styled += '<span class="cm-'+classname+'">'+escape(text)+'</span>';
                               else
                                 html_styled += escape(text);
                             });
          $("#tab-html pre").html(html_styled);
          $("#tab-html").data("format", true);
        }
      });
    },
    bind_run: function() {
      var _this = this;
      $('#run-task-btn').on('click', function() {
        _this.run();
      });
      $('#undo-btn').on('click', function(ev) {
        _this.task_editor.execCommand('undo');
      });
      $('#redo-btn').on('click', function(ev) {
        _this.task_editor.execCommand('redo');
      });
    },
    bind_save: function() {
      var _this = this;
      $('#save-task-btn').on('click', function() {
        var script = _this.python_editor.getDoc().getValue();
        $('#right-area .overlay').show();
        $.ajax({
          type: "POST",
          url: location.pathname+'/save',
          data: {
            script: script
          },
          success: function(data) {
            console.log(data);
            _this.python_log('');
            _this.python_log("saved!");
            _this.not_saved = false;
            $('#right-area .overlay').hide();
          },
          error: function(xhr, textStatus, errorThrown) {
            console.log(xhr, textStatus, errorThrown);
            _this.python_log("save error!\n"+xhr.responseText);
            $('#right-area .overlay').hide();
          }
        });
      });
    },
    bind_follows: function() {
      var _this = this;
      $('.newtask').on('click', function() {
        if ($(this).next().hasClass("task-show")) {
          $(this).next().remove();
          return;
        }
        var task = $(this).after('<div class="task-show"><pre></pre></div>').data("task");
        task = JSON.stringify(window.newtasks[task], null, ' ');
        CodeMirror.runMode(task, 'application/json', $(this).next().find('pre')[0]);
      });
      $('.newtask .task-run').on('click', function(event) {
        event.preventDefault();
        event.stopPropagation();
        var task = $(this).parents('.newtask').data("task");
        task = JSON.stringify(window.newtasks[task], null, ' ');
        _this.task_editor.setValue(task);
        _this.run();
      });
    },
    bind_others: function() {
      var _this = this;
      $('#python-log-show').on('click', function() {
        if ($('#python-log pre').is(":visible")) {
          $('#python-log pre').hide();
          $(this).height(8);
        } else {
          $('#python-log pre').show();
          $(this).height(0);
        }
      });
      $('.webdav-btn').on('click', function() {
        _this.toggle_webdav_mode(this);
      })
    },
    render_html: function(html, base_url, block_script, resizer, selector_helper) {
      if (html === undefined) {
        html = '';
      }
      html = html.replace(/(\s)src=/g, "$1____src____=");
      var dom = document.createElement('html');
      dom.innerHTML = html;
      if (block_script) {
        $(dom).find('script').attr('type', 'text/plain');
      }
      if (resizer) {
        $(dom).find('body').append('
PKd6[IF.Cweblocust/webui/static/templates/index_part/system_status.part.html
    System Status
    20% in 5min 40% in 1h 60% in 1day 80% in all
    • scheduler2fetcher: [$ queues.scheduler2fetcher $]
    • fetcher2processor: [$ queues.fetcher2processor $]
    • processor2result: [$ queues.processor2result $]
    • status_queue: [$ queues.status_queue $]
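    <!-- Assumption: the [$ ... $] bindings above look like AngularJS interpolation
         with custom start/end symbols, presumably configured in static/js/app.js so
         the client-side bindings do not clash with Jinja2's server-side {{ ... }}
         delimiters; angularindex.html below bootstraps the Angular app that fills
         these queue counters in. -->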
    PKd6[I?L  +weblocust/webui/templates/angularindex.html Weblocust Dashboard
    PKd6[I籞Y Y $weblocust/webui/templates/tasks.html Active Tasks
      {% for task in tasks | sort(reverse=True, attribute='updatetime') %}
    <li>
      {% if task.status %} {{ status_to_string(task.status) }} {% elif task.track %} {% set fetchok = task.track.fetch and task.track.fetch.ok %} {% set processok = task.track.process and task.track.process.ok %} {%- if not fetchok -%} FETCH_ERROR {%- elif not processok -%} PROCESS_ERROR {%- endif -%} {% else %} ERROR {% endif %}
      {{ task.project }} > {{ task.url }}
      {{ task.updatetime | format_date }}
      {% if task.track and task.track.fetch %} {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms {% endif %}
      {% if task.track and task.track.process %} +{{ task.track.process.follows | int }} {% endif %}
    </li>
    {% endfor %}
PKd6[I)ƀ#weblocust/webui/templates/task.html Task - {{ task.project }}:{{ task.taskid }} - weblocust

{{ status_to_string(task.status) }} {{ task.project }}.{{ task.process.callback }} > {{ task.url }} {% if task.status in (2, 3, 4) %} ({{ task.lastcrawltime | format_date }} crawled ) {% else %} ({{ task.updatetime | format_date }} updated ) {% endif %}

taskid
{{ task.taskid }}
lastcrawltime
{{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})
updatetime
{{ task.updatetime }} ({{ task.updatetime | format_date }})
# if task.schedule and task.schedule.exetime
exetime
{{ task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})
# endif # if task.track and task.track.fetch
track.fetch {{ (task.track.fetch.time * 1000) | round(2) }}ms
{{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}
# endif # if task.track and task.track.process
track.process {{ (task.track.process.time * 1000) | round(2) }}ms # if task.track.process.follows +{{ task.track.process.follows | int }} # endif
#- if task.track.process.exception {{- task.track.process.exception or '' }} # endif #- if task.track.process.logs {{- task.track.process.logs or '' }} # endif {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}}
# endif
#- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', ) #- for key, value in task.items() if key not in not_shown_keys
{{ key }}
{{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}
#- endfor
# if result and result.get('result'):
result
{{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}
# endif
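The `# if` / `# endif` / `# for` directives in the templates above are Jinja2 "line statements", not comments. A minimal, self-contained sketch of how such a template renders, assuming the webui enables `line_statement_prefix='#'` (the actual wiring would live in weblocust/webui/app.py):

```python
# Minimal sketch: Jinja2 "line statements" let a template write `# if ...`
# on its own line instead of `{% if ... %}`. Assumption: weblocust's webui
# configures this via line_statement_prefix='#'.
from jinja2 import Environment

env = Environment(line_statement_prefix='#')
template = env.from_string(
    "# if task.track\n"
    "track.fetch took {{ task.track.fetch.time * 1000 }}ms\n"
    "# endif\n"
)
print(template.render(task={'track': {'fetch': {'time': 0.123}}}))
# -> track.fetch took 123.0ms
```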
PKd6[I Ob%b%$weblocust/webui/templates/index.html Spider Console

Spider Task Console

scheduler ??? fetcher ??? processor ??? result_worker
??? + ???
{% if config.scheduler_rpc is not none %} Recent Active Tasks {% endif %}
{% for project in projects %} {% endfor %}
Group Project Status Rate/Burst Avg Time  Progress               Actions
{{ project['group'] or '' }} {{ project['name'] }} {{ project['status'] }} {{ project['rate'] }}/{{ project['burst'] }}
5m
1h
1d
all
# if config.scheduler_rpc is not none:
Active Tasks
# endif
# if config.resultdb:
Results
# endif
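The Rate/Burst column above reflects per-project throttling. Assuming weblocust keeps pyspider's semantics (`rate` ≈ tasks released per second, `burst` ≈ how many may go out at once), the scheduler's `weblocust/scheduler/token_bucket.py` implements a token bucket; the sketch below is a generic illustration of the idea, not that file's actual code:

```python
import time


class Bucket(object):
    """Generic token bucket: refills at `rate` tokens per second and holds
    at most `burst` tokens. A scheduler spends one token per released task
    and stops releasing when the bucket is empty."""

    def __init__(self, rate=1.0, burst=3.0):
        self.rate = rate
        self.burst = burst
        self.mtime = time.time()
        self.bucket = burst

    def get(self):
        """Current token count, after refilling for the elapsed time."""
        now = time.time()
        self.bucket = min(self.burst, self.bucket + self.rate * (now - self.mtime))
        self.mtime = now
        return self.bucket

    def spend(self, value=1):
        """Spend `value` tokens (one released task)."""
        self.get()
        self.bucket -= value


bucket = Bucket(rate=2, burst=5)
if bucket.get() >= 1:
    bucket.spend()  # release one task
```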
PKd6[IGvp%weblocust/webui/templates/result.html Results - {{ project }} - weblocust

{{ project }} - Results

# set common_fields, results = result_formater(results)
<table>
  <tr>
    <th>url</th>
    # for field in common_fields|sort
    <th>{{ field }}</th>
    # endfor
    <th>...</th>
  </tr>
  # for result in results
  <tr>
    <td>{{ result.url }}</td>
    # for field in common_fields|sort
    <td>{{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }}</td>
    # endfor
    <td>{{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }}</td>
  </tr>
  # endfor
</table>
    # set current_page = int(offset/limit) + (1 if offset%limit else 0)
    # set total_page = int(count/limit) + (1 if count%limit else 0)
    <ul class="pagination">
      <li><a>«</a></li>
      # set prev = 0
      # for i in range(0, total_page):
      # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5:
      # set prev = i
      <li><a>{{ i + 1 }}</a></li>
      # elif prev == i-1:
      <li class="disabled"><a>...</a></li>
      # endif
      # endfor
      <li class="{{ 'disabled' if current_page + 1 >= total_page else '' }}"><a>»</a></li>
    </ul>
PKd6[Ikwj>>$weblocust/webui/templates/debug.html {{ project_name }} - Debugger
_> {{ project_name }}      
WebDAV Mode
run
{#
__callback__ > __url__
#}
  • msg
  • follows
  • html
  • web
  • SelectorHelper
save
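The save button above is wired up in static/debug.js, which POSTs the editor contents as form data to `<current debug path>/save`. A hedged sketch of driving the same endpoint from a script — the `/debug/<project>` path and an auth-less instance are both assumptions, not documented API:

```python
# Sketch only: replays what static/debug.js does on Save. The /debug/<project>
# path and the open (auth-less) instance are assumptions; check your deployment.
import requests

project = "myproject"  # hypothetical project name
with open("myproject.py") as f:
    script = f.read()

resp = requests.post(
    "http://localhost:5000/debug/%s/save" % project,
    data={"script": script},
)
print(resp.status_code, resp.text)
```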
PKd6[ICzEE%weblocust/webui/templates/helper.html PKd6[Iq% #weblocust/webui/templates/helper.js// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-03-16 11:05:05 (function() { var loaded = false; var start_time = (new Date()).getTime(); function resize() { if (!loaded) parent.postMessage({type: 'resize', height: document.body.scrollHeight}, '*'); } window.addEventListener('load', function() { resize(); loaded = true; }); setTimeout(resize, 5000); setTimeout(resize, 10000); setTimeout(resize, 20000); setTimeout(resize, 30000); var css_helper_enabled = false; window.addEventListener("message", function(ev) { if (!css_helper_enabled && ev.data.type == "enable_css_selector_helper") { var script = document.createElement("script"); script.src = "//{{ host }}/static/css_selector_helper.js"; document.body.appendChild(script); css_helper_enabled = true; } }, false); document.addEventListener('click', function(ev) { ev.preventDefault(); }); })(); PK]Ic5)weblocust-1.0.2.dist-info/DESCRIPTION.rst
weblocust
=========

A Powerful Spider (Web Crawler) System in Python based on **pyspider**.

- Write script in Python
- more powerful WebUI than pyspider, with script editor, task monitor, project manager and result viewer
- [MongoDB](https://www.mongodb.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Beanstalk](http://kr.github.com/beanstalkd/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...

Release Notes
-------------

Although the `pyspider` framework was written by a Chinese developer, he works in the UK and his English is excellent. `pyspider` is well known for Python crawling not only in China but abroad as well, so the author never planned to write dedicated Chinese documentation. His blog introduces some early versions and how to use them; the project updates quickly, but the way it is used has barely changed, even if the internal structure has.

The author of `pyspider` goes by the nickname binux. He is impressively capable and knowledgeable, and I respect him for it.

`weblocust` is a set of improvements I made on top of `pyspider` to better fit our needs. `pyspider` originally supports many `resultdb` backends; I only optimized the case where `mongodb` is the `resultdb`. If you store results in `mysql`, the new `weblocust` features may not be available.

Main improvements:

* Improvements to the `webui`. binux had already done this part well; I changed most of this module for a better experience and display.
* The original `js` and `css` files were loaded from remote CDNs; I moved them into the package. A crawler cannot run without a network anyway, but sometimes we still need to browse results offline.
* Changed the structure `mongodb` uses to store `result`s. `mongodb`'s schemaless design is a good match for crawl results whose fields vary a lot, so that property should be exploited instead of keeping the schema uniform with `mysql`.
* Added an `xpath` method for extracting page content.
* Made `response` partially compatible with `scrapy`, because `scrapy`'s `linkextractor` is very handy; if you run Python 2.7 you can use `scrapy`'s `linkextractor`.
* Added a data cleaning module, `cleaner`, inspired by `scrapy` (a hedged sketch of the idea follows these notes).
* Provided one-stop solutions for `OnePageHasManyItem` and `OneItemHasManySubItem`, which suit pages such as blog comments and forum replies.
* Provided flexible storage: stock `pyspider` can only run one kind of `result_worker` at a time, which makes storage quite inflexible; in `weblocust` you can define your own storage for any individual result.
* Added small developer tools, such as automatically stamping modification times into files, adding new authors, and auto-deploying the documentation.
* Fixed some bugs.

About this documentation: its intended readers are Chinese, so it is edited on top of binux's documentation. The Chinese parts are new; the English parts have minor edits or additions. I also replaced `pyspider` with `weblocust` throughout the docs and code, not to hide that `weblocust` is based on `pyspider`, but simply to keep the project naming consistent. The `author` field in the source always keeps `binux`'s name; I added myself under `contributor`.
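Before the sample code, here is the cleaner idea in spirit. This is a hypothetical sketch: `TakeFirst` and `JoinCleaner` below are stand-ins written for this document; the real classes live in `weblocust/libs/cleaners.py` and their signatures may differ.

```python
# Hypothetical stand-ins illustrating the scrapy-inspired "cleaner" idea;
# the real implementations are in weblocust/libs/cleaners.py and may differ.
class TakeFirst(object):
    """Return the first non-empty value from a list of extracted values."""
    def __call__(self, values):
        for value in values:
            if value is not None and value != '':
                return value


class JoinCleaner(object):
    """Strip each value and join the non-empty ones with a separator."""
    def __init__(self, separator=' '):
        self.separator = separator

    def __call__(self, values):
        return self.separator.join(v.strip() for v in values if v and v.strip())


assert TakeFirst()(['', None, 'first', 'second']) == 'first'
assert JoinCleaner(', ')(['  spam ', '', 'eggs']) == 'spam, eggs'
```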
Sample Code
-----------

```python
from weblocust.libs.base_handler import *
from weblocust.libs.useragent import IphoneSafari, LinuxChrome
from weblocust.libs.cleaners import TakeFirst, JoinCleaner, StripBlankMoreThan2
from weblocust.libs.cleaners import reduceclean, mapclean, mapreduce


class Handler(BaseHandler):
    crawl_config = {
        'headers': {'User-Agent': LinuxChrome},
        "cookie": "a=123",
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    def on_result__detail_page(self, result):
        """ you can save the results on your own demand """
        pass
```

WebUI
---------

![Demo Img]

Installation
------------

You can install weblocust in two ways:

1. the most convenient way: `pip install weblocust`
2. install from source code: `git clone https://github.com/qiulimao/weblocust.git`, then `$ python setup.py install`

Then run `weblocust mkconfig` to generate a simple configuration file.
Finally, run `weblocust -c generatedfilename` and visit [http://localhost:5000/](http://localhost:5000/).

Contribute
----------

TODO
----

### next version

* keep in space

### more

- [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV)

License
-------

Licensed under the Apache License, Version 2.0

[Demo Img]: imgs/demo.png
[Issue]: https://github.com/qiulimao/webocust/issues
[Demo Img]: docs/imgs/demo.png
[Issue]: https://github.com/qiulimao/webocust/issues

PK]I22*weblocust-1.0.2.dist-info/entry_points.txt[console_scripts]
weblocust = weblocust.run:main
PK]Iq'weblocust-1.0.2.dist-info/metadata.json{"classifiers": ["Development Status :: 4 - Beta", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "License :: OSI Approved :: Apache Software License", "Intended Audience :: Developers", "Operating System :: OS Independent", "Environment :: Web Environment", "Topic :: Internet :: WWW/HTTP", "Topic :: Software Development :: Libraries :: Application Frameworks", "Topic :: Software Development :: Libraries :: Python Modules"], "extensions": {"python.commands": {"wrap_console": {"weblocust": "weblocust.run:main"}}, "python.details": {"contacts": [{"email": "qiulimao@getqiu.com", "name": "qiulimao", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/qiulimao/weblocust"}}, "python.exports": {"console_scripts": {"weblocust": "weblocust.run:main"}}}, "extras": ["all", "test"], "generator": "bdist_wheel (0.26.0)", "keywords": ["scrapy", "crawler", "spider", "webui", "pyspider", "weblocust"], "license": "Apache License, Version 2.0", "metadata_version": "2.0", "name": "weblocust", "run_requires": [{"requires": ["Flask (>=0.10)", "Flask-Login (>=0.2.11)", "Jinja2 (>=2.7)", "chardet (>=2.2)", "click (>=3.3)", "cssselect (>=0.9)", "lxml", "mkdocs", "mongoengine", "pycurl", "pymongo", "pyquery", "requests (>=2.2)", "six (>=1.5.0)", "sqlalchemy", "tornado (>=3.2)", "u-msgpack-python (>=1.6)", "wsgidav"]}, {"extra": "all", "requires": ["SQLAlchemy (>=0.9.7)", "amqp (>=1.3.0)", "beanstalkc", "elasticsearch", "kombu", "mysql-connector-python (>=1.2.2)", "pika
(>=0.9.14)", "psycopg2", "pymongo (>=2.7.2)", "redis"]}, {"extra": "test", "requires": ["coverage", "easywebdav", "httpbin", "pyproxy (>=0.1.6)", "unittest2 (>=0.5.1)"]}], "summary": "A more Powerful Spider System in Python based on pyspider", "version": "1.0.2"}PK]I.3'weblocust-1.0.2.dist-info/top_level.txtsite weblocust PK]I''\\weblocust-1.0.2.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any PK]IQ"weblocust-1.0.2.dist-info/METADATAMetadata-Version: 2.0 Name: weblocust Version: 1.0.2 Summary: A more Powerful Spider System in Python based on pyspider Home-page: https://github.com/qiulimao/weblocust Author: qiulimao Author-email: qiulimao@getqiu.com License: Apache License, Version 2.0 Keywords: scrapy crawler spider webui pyspider weblocust Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: License :: OSI Approved :: Apache Software License Classifier: Intended Audience :: Developers Classifier: Operating System :: OS Independent Classifier: Environment :: Web Environment Classifier: Topic :: Internet :: WWW/HTTP Classifier: Topic :: Software Development :: Libraries :: Application Frameworks Classifier: Topic :: Software Development :: Libraries :: Python Modules Requires-Dist: Flask (>=0.10) Requires-Dist: Flask-Login (>=0.2.11) Requires-Dist: Jinja2 (>=2.7) Requires-Dist: chardet (>=2.2) Requires-Dist: click (>=3.3) Requires-Dist: cssselect (>=0.9) Requires-Dist: lxml Requires-Dist: mkdocs Requires-Dist: mongoengine Requires-Dist: pycurl Requires-Dist: pymongo Requires-Dist: pyquery Requires-Dist: requests (>=2.2) Requires-Dist: six (>=1.5.0) Requires-Dist: sqlalchemy Requires-Dist: tornado (>=3.2) Requires-Dist: u-msgpack-python (>=1.6) Requires-Dist: wsgidav Provides-Extra: all Requires-Dist: SQLAlchemy (>=0.9.7); extra == 'all' Requires-Dist: amqp (>=1.3.0); extra == 'all' Requires-Dist: beanstalkc; extra == 'all' Requires-Dist: elasticsearch; extra == 'all' Requires-Dist: kombu; extra == 'all' Requires-Dist: mysql-connector-python (>=1.2.2); extra == 'all' Requires-Dist: pika (>=0.9.14); extra == 'all' Requires-Dist: psycopg2; extra == 'all' Requires-Dist: pymongo (>=2.7.2); extra == 'all' Requires-Dist: redis; extra == 'all' Provides-Extra: test Requires-Dist: coverage; extra == 'test' Requires-Dist: easywebdav; extra == 'test' Requires-Dist: httpbin; extra == 'test' Requires-Dist: pyproxy (>=0.1.6); extra == 'test' Requires-Dist: unittest2 (>=0.5.1); extra == 'test'
PK]I<< weblocust-1.0.2.dist-info/RECORDsite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 site/conf.py,sha256=mxYXo1jazUata06HYGCMJxiboeaBpuIYu0q8jShJFuw,700 weblocust/__init__.py,sha256=-W8UwyWnkVOc6CCBVcaL1Yul8M_FwoMpuPNv1Ts4k1w,342 weblocust/logging.conf,sha256=A0SVYmtapOiUrghtRPF9KILDU0PHMMJozCwg2bVnXzM,881 weblocust/run.py,sha256=YR30IhLRE6iYlbFooxSZleIuJLPpXXjhWjhNgsyF7nI,30408 weblocust/database/__init__.py,sha256=scejeuYXO6u-LPwPQPvEgdYk-73n8R_uHV8nCMnErrk,6078 weblocust/database/basedb.py,sha256=tlU9Y46oIKt_r7D0phqu66Ghf36Zy6sYUfVVs_hgI0U,5641 weblocust/database/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 weblocust/database/base/projectdb.py,sha256=Mxo-vgPRJXF2q-wOFnaaWjENMJ1X6z-mxvA-757O_go,1911 weblocust/database/base/resultdb.py,sha256=XmGkXugHiHlZT-qEfSFatm0RMcIxaQEYxa56MPOc9hA,1396 weblocust/database/base/taskdb.py,sha256=mpnQMbTqeiNHJbINR-n_vJjgfSqoEaV3bFUUG8Eq7x4,3067 weblocust/database/elasticsearch/__init__.py,sha256=rbYQuc3iG_HiWYpwfn53wNWPZaEtIPCwHtJ_pc8NPd0,301 weblocust/database/elasticsearch/projectdb.py,sha256=GRJ7xeRzzd3f8-W07gdP6NB1k_O0krEBfEim4UkXUFk,2938 weblocust/database/elasticsearch/resultdb.py,sha256=XDIWGMGXlogyYFWdNteqWqh3ti0VIMGjas1inQnUVBU,3983 weblocust/database/elasticsearch/taskdb.py,sha256=5cG_I-IHWTfTRblW0IGDZA4g2jt8nQtH4YbGpAWtrME,5199 weblocust/database/local/__init__.py,sha256=gZbma0ZVjnp4uOPOkLB9_bNyWYVmpNT4750fYUOc2sI,301 weblocust/database/local/projectdb.py,sha256=Aiw7VSn52vEb43FQZsM43iHtJkWXfZti_nhZgfsJVtE,3129 weblocust/database/mongodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 weblocust/database/mongodb/mongodbbase.py,sha256=QxJfClXh3a8b9ctwni0RsFijkWGubXR3M-fSwGqyZUI,1609 weblocust/database/mongodb/projectdb.py,sha256=vwwCqCkmYV4VD4529ct9BIsGiMY6HMqw4_a6yu4JEKQ,2368 weblocust/database/mongodb/resultdb.py,sha256=hMQQGIA96T2KUXJBBJdZJCc-AOWI8BOmzbpNwpLNcpo,5871 weblocust/database/mongodb/taskdb.py,sha256=_crxmvp2pirEi42okAoKkUqgPjOqFHPabSOCvKkhQ7Q,4921 weblocust/database/mysql/__init__.py,sha256=UPnMik3tzsKiv-nHb22_w15F7SxJPSdQXMn3pIyY8gk,299 weblocust/database/mysql/mysqlbase.py,sha256=ba4cbtEGcu8xK8OXXERGBVem85HFNozYsUhDkk5-S7Y,1995 weblocust/database/mysql/projectdb.py,sha256=Sm5oMGUrDbyKeBXDAvUX3OKEHMUvIKaqw4NGy76eAtw,2518 weblocust/database/mysql/resultdb.py,sha256=fTOGgZrjlDtKnuG-XqafMv-kzQwVgNmd34FWWwLtE1s,3755 weblocust/database/mysql/taskdb.py,sha256=zRA7YmPkoZTlv5_TW133hnV9G28FzAhgfzL4PGNAMZM,5024 weblocust/database/redis/__init__.py,sha256=iasQKxo4Om-yLVmKmT5e97TRbfixWLFQMukEb7LMEgY,302 weblocust/database/redis/taskdb.py,sha256=Dk-xGwMuBHONqEdasBzhCDY4BZ1B06EPPcDwi9JHRAc,5900 weblocust/database/sqlalchemy/__init__.py,sha256=E7_M973rC5ffWCqG66kqpf61Nrak32qGMjhFxGP6jKA,302 weblocust/database/sqlalchemy/projectdb.py,sha256=5amBy4yBNdFN-LebDcMQpDfvZCPqnzUa_u5wk-Ou7nQ,3948 weblocust/database/sqlalchemy/resultdb.py,sha256=EHh6hpLxa1bT_ydbAuYPWN677_lWTdqtKAT7UKb2SyY,5090 weblocust/database/sqlalchemy/sqlalchemybase.py,sha256=z6eooQSIoWzePcVagd8J-YcdbMNRcmuuxRjqdSrVIEk,1654
weblocust/database/sqlalchemy/taskdb.py,sha256=2NG__YUMg7fdz7XBGcuaq6RFnNXxqbaalADQciri6RA,6240 weblocust/database/sqlite/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 weblocust/database/sqlite/projectdb.py,sha256=g0oo5O8cUG8S363sUVxLrNd1z4JjC_5wpFbtgfm3tUQ,1953 weblocust/database/sqlite/resultdb.py,sha256=qDHyQHp4H_0jGrlcpXITkSyxhheFNq8zGYE9DMywBn0,3016 weblocust/database/sqlite/sqlitebase.py,sha256=e5Lrqi2phlV0NQPDfTJxUOBK6mrDHurRsMA9HA7VI8U,2001 weblocust/database/sqlite/taskdb.py,sha256=yJ8J7dv-wPoXsgpe4ga3ICFztCrsYRBs3eOGSJiIoQM,4136 weblocust/fetcher/__init__.py,sha256=Ti3glfNLYJIkclu6CptTx94Exqa7sknSesAdaB_GIPs,37 weblocust/fetcher/cookie_utils.py,sha256=rgMJz_iZbsuWqjtddebbKuQmoyQd-Gjv9HZurn0d3DI,958 weblocust/fetcher/phantomjs_fetcher.js,sha256=yQ3wlvFVvwS0AO_rUb24uB5exgN9jM90eyjBg3kDkqU,6613 weblocust/fetcher/tornado_fetcher.py,sha256=j1sA0i9wT-HJuIuiM3vMDsUUPvfcqIlsQPGKax-YVmA,23159 weblocust/libs/ListIO.py,sha256=vSDP3LIaLK7cqdxMPA490eYyraGg9-LYyD162cO44Vs,838 weblocust/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 weblocust/libs/base_handler.py,sha256=ErUtMB-cwdKqs6O8RccoD9kHySGDyGR-zMfG911zq-c,17200 weblocust/libs/bench.py,sha256=O5EDaDEToKyFcXzIH9v9v_5rJdoms5FvE_-6NnywAXU,8335 weblocust/libs/cleaners.py,sha256=DUVfChaeOCrSX8mb8iyZJDikdvM1Cym7E-FvpsPGuY0,2573 weblocust/libs/counter.py,sha256=__9rsZOYuHIern97em6BuZdXnKzsbx2RHO4SVz_ol8U,12599 weblocust/libs/dataurl.py,sha256=vNZLYIP1nnatB9GHApRAT-CIGPuhNtbyt8Z8jxg3T2g,1433 weblocust/libs/log.py,sha256=_yZYhPINyoIKxnWrXHFC9D74nPvEWHCg5IAoXcc4Vps,1298 weblocust/libs/multiprocessing_queue.py,sha256=4Ve0BiVfFWwznY-855yYeFgf0qi0-o5zmCYgtktYnZw,2808 weblocust/libs/pprint.py,sha256=Dg4TsT8O8JiOIdt-noKDfinZa9s5vDz6aKSNCRxEPrU,12676 weblocust/libs/response.py,sha256=yeDQxouFdhgjev-7nRrv9TAyD_VyHVhPlK13Kh5rKfs,8440 weblocust/libs/result_dump.py,sha256=y1yDP7y2AkjALEwQy5Yx0gMFIwrjxkfEOQ1ZbG-EtCs,4067 weblocust/libs/sample_handler.py,sha256=lURdNp71DxHt70YSL6hWTEsjy_2WQC9kSsaJwhYAsPM,2501 weblocust/libs/url.py,sha256=4ufqvEwonBfeBHJ7P0i5Ej2GF7iqEeDVHNhii-28-Ng,3929 weblocust/libs/useragent.py,sha256=lc9cYnkpFCcls0HOtwp6RCZzwgR-QGLu9x1uLXZgGG0,317 weblocust/libs/utils.py,sha256=WVAmDZe6cvXkGAC6HyQcuPHUNLhnZIbMbFs1hq2V6p0,11924 weblocust/libs/wsgi_xmlrpc.py,sha256=icKgSGC_sVeqbIE37n2spSln7M8QpLeWbBjsXT5vsNM,3784 weblocust/message_queue/__init__.py,sha256=dqwj_lYt_eElx5gjdwZTttE9fVK9eLfhobLsq1Fg37A,1912 weblocust/message_queue/beanstalk.py,sha256=Qs-RMft6Cl57y-9G-ttnC2L_8O-qYsP1PI-9GzFvoh8,3654 weblocust/message_queue/kombu_queue.py,sha256=T9_hi8hwoNJalJwhw81e2gp43u3HL_z00g9rOHjuhZk,3386 weblocust/message_queue/rabbitmq.py,sha256=cTd4njbQJoPq8s3tAhaOpbvhDxurgGak6eKuZl6kFUI,8811 weblocust/message_queue/redis_queue.py,sha256=zmGY3GOpx3-vr-DhnX1A2PsuvNgEHC4d9VVSA5XJgnQ,3162 weblocust/processor/__init__.py,sha256=HmoiZLGAWkcWADSLXtokSjUyZaTI7Ro_Qfxb7GNUhho,50 weblocust/processor/processor.py,sha256=1VDgfvZLiOHOS9byYBB8lf9mMQjpih9WIFnvveEvqG8,8183 weblocust/processor/project_module.py,sha256=791lFro6uzLxvzapIyo0kPOUy8O2gsQPchLEREOoE9Y,7568 weblocust/result/__init__.py,sha256=qKdvu3fKBHPVi9dUCjSws2oKGEVW14ikfT3fuLAcqyM,377 weblocust/result/result_worker.py,sha256=7Oj8VIHUL7Aw6M_j24QIC4Ac7FdM8XrCEHFyQU9i9eE,3608 weblocust/scheduler/__init__.py,sha256=uB2xaKZs51dAzalRigyjaDUJlyLQCP6eqRXYVyHWxCc,76 weblocust/scheduler/scheduler.py,sha256=ipkazvzt1tErPu-f5Eik0rXgB6xhge3hi-9cEyXFlWU,39313 weblocust/scheduler/task_queue.py,sha256=iaEVViwuSEPscFgDBjvR5CJrXJa_2bN_bKzepEx5Wog,7311 
weblocust/scheduler/token_bucket.py,sha256=0WzqiqEQjdM4P04X_sgfGePIRZt5e8tfZ1xO-hMWkLc,1533 weblocust/webui/__init__.py,sha256=7-2BClKXO0BWcq9ikwAFlJFlIzPUYvrOMxh7_SQa-3M,353 weblocust/webui/app.py,sha256=xv5QECf00Ut7yfOPHzLAsTL9StSVf8HSxivdQn4-SZU,3702 weblocust/webui/bench_test.py,sha256=LtMWQ2ycYnv5jt7hXCeZkmk1YyktoarLodBDt6JQS78,1002 weblocust/webui/debug.py,sha256=x8jNUN135XEykCDOY0xdG7dqdH0fmytzqNuUtGk0c7I,9344 weblocust/webui/index.py,sha256=jjtB3oUA4KLUGljFDLfPZOzaxR2lPBqTgZnzLK3Oc-Y,5536 weblocust/webui/login.py,sha256=e-yvdLku7ks1veJhMKRX6i-ecEF_TsWHQ8Sex0PiShE,1965 weblocust/webui/result.py,sha256=egi9g1OHL3ymLIGyKWPtpK43WpujDzTrxiB2FQMLbR4,3451 weblocust/webui/task.py,sha256=Hkbvbs1QzCP-G9CriO86bykh5k1nZbriMeb0YxKhLqM,3846 weblocust/webui/webdav.py,sha256=I__13ZGGukpOm8PNB1wTiPHyfbgo11YJ_SqIx1jnlao,6251 weblocust/webui/static/css_selector_helper.js,sha256=WkYTY9N6LVl3B79axhIBXuPduW9M0h1KtTGcSfJ2KRE,7139 weblocust/webui/static/debug.css,sha256=3KDHo45BUDs4yDidhEW1FOXBvVUTowGzOJLc9Niy5eo,7210 weblocust/webui/static/debug.js,sha256=czdIOf-MwVDAAFqAboSlQjDBWryQAvp9VqxmE0Lrm_Y,19904 weblocust/webui/static/debug.less,sha256=V4C-WziPQzk97XFLYKXS7DCKFd1c-TQusJ5Bp73briQ,7063 weblocust/webui/static/index.css,sha256=5ibU7N7ihyPGIl28D3P-F8bjBVaLQA2rJDOF1F1u4Ac,2447 weblocust/webui/static/index.js,sha256=2ULe2OuwSvbiaB4YPZ1Id5wtacW5bBtlwcQ5l3yehoQ,6387 weblocust/webui/static/index.less,sha256=nhuIDfI4RmByFNTrpP8OF4qG7-NWiUS1idfk2j-HOkA,2019 weblocust/webui/static/result.css,sha256=kmDp4Fmc-FlZb8hYOQXezXfSc6pLaz69fZA2sa2V9WA,743 weblocust/webui/static/result.less,sha256=kiAiTM7p80L9XEog2MRGz2pPPbYLPy1CdgAuMTmcxHs,641 weblocust/webui/static/splitter.js,sha256=bOqBUi8FXl-hvBJvO5uwwA_XGXvy_tbyBou8xlbnMWg,9861 weblocust/webui/static/task.css,sha256=fHdfrt0q_xsDMqv4Y8ciIj-PaBv45P8xhewb8dWZ72w,1277 weblocust/webui/static/task.less,sha256=kOEIF67pEJxJ4WiiSNCHzmBAlSbs2LN6XYL07ovhDT8,1023 weblocust/webui/static/tasks.css,sha256=XeV10Cta2ufn1etgquQHnS2s9Q2S-dYuFsr84OeXN9U,1860 weblocust/webui/static/tasks.less,sha256=MFPic7aVd9gb1kIz-peFS2-C7vvzRiMg3IFf41LiJB4,556 weblocust/webui/static/variable.less,sha256=Nstrxkpzl9EhOygqxkl_rn_MfzYGCYD2H9ouvTlvSA4,545 weblocust/webui/static/css/bootstrap.min.css,sha256=MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc,122540 weblocust/webui/static/css/codemirror.min.css,sha256=ZQ6dZGaR22qgIYvDdwtN0Ff8H1G8WGJr9pLmONOEdaU,4159 weblocust/webui/static/css/dialog.min.css,sha256=VwSUqnBSfnIURedXsv8SWc7-L5fdgZIX8QRLECH7jyI,389 weblocust/webui/static/css/font-awesome.min.css,sha256=sSwc2BH1TRG_3LXiNec5NKi4p6her7hSkRf5pbtkzPg,17780 weblocust/webui/static/fonts/fontawesome-webfont.ttf,sha256=rhni5MBPKwS_AwaExMHbj69cj-PuA9HgxAkEZgiziRI,152796 weblocust/webui/static/fonts/fontawesome-webfont.woff,sha256=rbxPlettfyc4lZzw7Lw3RnL85H6FYFCo6XkfRXYjrCw,90412 weblocust/webui/static/fonts/glyphicons-halflings-regular.eot,sha256=E2NNqH2eI_jD7ZEIzhck0YOjmtBy5z4bPYy_ZG0tBAc,20127 weblocust/webui/static/fonts/glyphicons-halflings-regular.svg,sha256=QvYGWdJlwaPDD5-kKry7Vr1KU69Ng9MW1t16NpA8Q-U,108738 weblocust/webui/static/fonts/glyphicons-halflings-regular.ttf,sha256=45UEQJN1fYKvyxOJV9BqHqk2G9zwtELQahioBRr1dFY,45404 weblocust/webui/static/fonts/glyphicons-halflings-regular.woff,sha256=omOU9-3hAMoRjv8u2ghZYnWpg5uVnCJuFUOVV6WoB0I,23424 weblocust/webui/static/fonts/glyphicons-halflings-regular.woff2,sha256=_hhdEaSWdokNR7t4MxKgzaWkTEA5IUCU55V7TAQO8Rw,18028 weblocust/webui/static/image/favicon.png,sha256=G-3WoZSJcfB5cEFHFwElA4BTCfJa8LLFQtvDUktYgOk,33270 
weblocust/webui/static/js/animation.js,sha256=JCUaB2w0G2Dg3-VoLJq88WOzdgvllwr2NiSQmyg2aPk,980 weblocust/webui/static/js/app.js,sha256=IgCPw8EQSsBDQ5qkG4iOEC1dghl4Odx84LO3-Eptv7I,1371 weblocust/webui/static/js/controller.js,sha256=4S5dXfdGfER5P-_nMEt47n-1bUXQ7Q_gG58tQFRL4_c,7963 weblocust/webui/static/js/derective.js,sha256=QoI-NxJxKiVV7LsdMu4pDZn2TgVE7C2lEy9v-WW7Ncc,28 weblocust/webui/static/js/filter.js,sha256=wzSINDeHEEM6lZ0tgTwBAGNH1yoLeLwrla2V-voQF5I,2546 weblocust/webui/static/js/service.js,sha256=4oGqkZjAHH3AT-nEjqgfAtlH8bW1qp0HWvKis3qWxqc,1300 weblocust/webui/static/js/core/URI.min.js,sha256=BbpHxSsq5sTkTi6CS4yyO4DBN_XpuEJwSQViDtNWp7g,41159 weblocust/webui/static/js/core/active-line.min.js,sha256=B_S_E7ppEY69iLB7bGbyEfYQrMPN8KkyI1KmuBALo84,682 weblocust/webui/static/js/core/angular-animate.min.js,sha256=PPEc19FAcSLqaoXrWZKBR2m400Vvju0eo7OtaqBDqK4,25272 weblocust/webui/static/js/core/angular-resource.min.js,sha256=OR0USbRjsYzz-c67onb0rnSGwJmS19qYK4n8GuHhBzs,4483 weblocust/webui/static/js/core/angular-route.min.js,sha256=u23hSzGDN_XE1qg0_dMJDaRgdp5ZQmcTmL3ou9cKJa0,4530 weblocust/webui/static/js/core/angular-sanitize.min.js,sha256=FmJidxOQnEEUAC5KQkqZlN7EMkNgjesYpB73JyKvQfo,5828 weblocust/webui/static/js/core/angular-touch.min.js,sha256=5RCGYincFJLZ39PVfEr7XUD2ze3YSe8fchjcQEvO5qA,3942 weblocust/webui/static/js/core/angular.min.js,sha256=6df1Ql4Whx0pjJ55sHrlKD7wZW98y9jN9txlhnOu90k,154333 weblocust/webui/static/js/core/codemirror.min.js,sha256=CP7tV6IDTAyUvJwxDB9iFoTprnIHfje-qIMjCMhqU48,115292 weblocust/webui/static/js/core/css.min.js,sha256=vLvJFSRKHW_cR3lOdn137X781j9N8YPZXaox7cOFBLk,20481 weblocust/webui/static/js/core/dialog.min.js,sha256=B0moA_BYe_yjGpkXbnACbUenK3kxk0JPJq5QSqzRs9g,1897 weblocust/webui/static/js/core/formatting.min.js,sha256=cQIOgWLJFNIFx783fkk5IqcSRn-HGTA1JtHP0pHxzEk,3267 weblocust/webui/static/js/core/htmlmixed.min.js,sha256=8LD8UYe0In19P3tsCwsz2IiYbcH0umh-xqfRuV5rNSg,2335 weblocust/webui/static/js/core/javascript.min.js,sha256=ie7c8Je1HfVE9Evvv94R9v1ABNPdloNr4QwYF9JILeY,11167 weblocust/webui/static/js/core/jquery.min.js,sha256=h0cGsrExGgcZtSZ_fRz4AwV-Nn6Urh_3v3jFRQ0w9dQ,84245 weblocust/webui/static/js/core/python.min.js,sha256=02lPIzD8fkvC33OiSZAGaarVmSgdDSOBbBsOTxua2hU,5362 weblocust/webui/static/js/core/runmode.min.js,sha256=JPJewH3ncWWj9kUZB-MnV0X3zSqH_m5SUytkfY0D72g,944 weblocust/webui/static/js/core/search.min.js,sha256=kUfF53XOFELjrBnBIVZgd1e8VhqIplltkrgPri9crwg,3016 weblocust/webui/static/js/core/searchcursor.min.js,sha256=XUolY0j6vdEcxaH8wfeXUYMKd2UQsJoarjAHQwiT368,3020 weblocust/webui/static/js/core/ui-bootstrap-tpls.js,sha256=BCvCNNYYMAwtKYlvDruDF_RWZSusvlBSbzSLRd1jI0o,260013 weblocust/webui/static/js/core/ui-bootstrap.js,sha256=5BnyVCuXBR-exHa2qFJb7NKLRO8K4N2sJYIAC6xID5w,232261 weblocust/webui/static/js/core/xml.min.js,sha256=K-cWoqBgYReGUEIfWDolUzEtmU5IIsr8It95zl-07n4,5012 weblocust/webui/static/templates/debug.html,sha256=kQsdkLPyyq4Z2-5-nwfIa5zT1x7lziqMdvqGFc-bZQU,1258 weblocust/webui/static/templates/delete-alert.html,sha256=8xAv_d9Z5bUkrW5UBmr_g-hVv_Sx4kMCPxFq2YAAobM,597 weblocust/webui/static/templates/footer.part.html,sha256=tXZtjTqRj1mzjeXdbmkzRNlTf1o4FU7eU8fgSBI9Ru0,1192 weblocust/webui/static/templates/index.html,sha256=l3mANd5xUamvkVn8826yN6VJ1QiAnaz16uLZh_R8yPU,485 weblocust/webui/static/templates/navbar.part.html,sha256=xlaR9KOnY9sk78jfQ8G05lZ7KtP7Xp2JLVa7phT6jQ8,1309 weblocust/webui/static/templates/results.html,sha256=jFZ6P1Fp8dqWoltlXKvdvijqxRr2lbd5mS0tlplflRM,3542 
weblocust/webui/static/templates/tasks.html,sha256=w9NulqQk5tSzHSejDkW6PLfvF6bNoShgCGUQ3OXS2eU,2302 weblocust/webui/static/templates/index_part/create_spider.form.html,sha256=K4Vx0dP_ZX3R4x5uLlFq0NuGpolHucDjgxAdMGwu1rA,1862 weblocust/webui/static/templates/index_part/spider_status_table.html,sha256=mV0UxeFeSstYBxxR5L6_lPK77G-27-Lyi16ZN8xcntU,3679 weblocust/webui/static/templates/index_part/system_status.part.html,sha256=VZ_ptRcuawnr_JOCLDmQxb5FXvok0ffFwSR0Sqkf2v0,4058 weblocust/webui/static/templates/index_part/widgets.html,sha256=5DE_Me9KzApBOyN47cvbMkMJgrlTe5YsFX6Twrg0Rbo,3668 weblocust/webui/templates/angularindex.html,sha256=pWTIGsxND-686SRANHNywKlRYpO4nRWByIRRZl6ydzU,2319 weblocust/webui/templates/debug.html,sha256=MC4qsdCK5Ce6OpYE-ZUmAItMgkfsJqHGVQxfLDw3umY,6974 weblocust/webui/templates/helper.html,sha256=XjkxSj-6XaCKgMrkjE_OPX97bbYrrHInv2_p2p60TFE,325 weblocust/webui/templates/helper.js,sha256=BCaVCW1EmYI3IDe-jcMQVjUEyeCGei8ErJbzfc9iygk,1017 weblocust/webui/templates/index.html,sha256=278-ceziu8NRv_9oJ_1DbgFTcch1MhoOX_WNBK2u8SI,9570 weblocust/webui/templates/result.html,sha256=Ju4EYvwWOAw-3SrunD1Wl9k6SMiA2MGx8CZgosF4FCQ,3752 weblocust/webui/templates/task.html,sha256=UvnsSEYJ-VxrLm9XiqCCS7qrENw6DJiSVo4Ou9MLUv4,3968 weblocust/webui/templates/tasks.html,sha256=grr0mgIiGss9I9SqBPebLzbHdTPaOW9v6kbQP4ShGIA,2393 weblocust-1.0.2.dist-info/DESCRIPTION.rst,sha256=D7ls8UCInhau6XPJPv_kEJW6TBgZuhhl1UloEvNWdL4,5115 weblocust-1.0.2.dist-info/METADATA,sha256=fsTYkxk2WSuAJdErynzkNAKaXyu7LrpPPior8b3Jk3w,7415 weblocust-1.0.2.dist-info/RECORD,, weblocust-1.0.2.dist-info/WHEEL,sha256=JTb7YztR8fkPg6aSjc571Q4eiVHCwmUDlX8PhuuqIIE,92 weblocust-1.0.2.dist-info/entry_points.txt,sha256=OTqckAY-ItwpfyDqAMMBtmBLeZLIIPkhcAad8XUf0I4,50 weblocust-1.0.2.dist-info/metadata.json,sha256=GVS-FdABKdbwkMIJMbRx3zDinooYyw7gzbn97rrIx3c,1993 weblocust-1.0.2.dist-info/top_level.txt,sha256=LLMUGo3Pcz1a3TUZCe3MDDrx-s3UC2mpVyeWPa8WHWk,15