nmqn/__init__.py

__version__ = '0.1.0'


nmqn/_common/__init__.py

(empty)


nmqn/_common/config.py

import re

import yaml
from urllib.robotparser import RobotFileParser


def parse(path):
    """Parse a user-prepared configuration file (YAML).

    Args:
        path (pathlib.Path/str): configuration file

    Returns:
        Config: configuration object
    """
    with open(path, "r") as f:
        # safe_load: yaml.load without an explicit Loader is unsafe and deprecated
        config = yaml.safe_load(f)
    return Config.parse(config)


class Config(object):
    def __init__(self, name, deviceconfs):
        self.name = name
        self.deviceconfs = deviceconfs

    @classmethod
    def parse(cls, config):
        return cls(config["name"],
                   [DeviceConfig.parse(config, d) for d in config["options"].keys()])

    def __iter__(self):
        return iter(self.deviceconfs)


class DeviceConfig(object):
    def __init__(self, name, device, robotsurl, options, nodes):
        self.name = name
        self.device = device
        self._robots = RobotsConfig(robotsurl)
        self.options = options
        self.nodes = nodes

    @classmethod
    def parse(cls, config, device):
        nodes = [Node.parse(n, device) for n in config["nodes"]]
        options = BrowserOptions.parse(config["options"], device)
        return cls(config["name"], device, config["robots"][device], options, nodes)

    def check_robots_txt(self, nodes):
        self._robots.load()
        for n in nodes:
            # can_fetch takes (useragent, url); the original call had the arguments swapped
            if not self._robots.can_fetch(self.options.useragent, n.url):
                raise RuntimeError(f"Fetching is not allowed by robots.txt: {n.url}")


class BrowserOptions(object):
    def __init__(self, useragent, viewport):
        self.useragent = useragent
        self.viewport = viewport

    @classmethod
    def parse(cls, config, device):
        c = config[device]
        return cls(c.get("useragent", None), c.get("viewport", None))


class Node(object):
    def __init__(self, name, url, childs):
        self.name = name
        self.url = url
        self.childs = childs

    @classmethod
    def parse(cls, conf, device):
        childs = [ChildNode.parse(x, device) for x in conf.get("childs", [])]
        return cls(conf["name"], conf["url"][device], childs)


class ChildNode(object):
    def __init__(self, name, url_regexp, childs):
        self.name = name
        self._url_regexp = re.compile(url_regexp)
        self._childs = childs

    @classmethod
    def parse(cls, conf, device):
        childs = [ChildNode.parse(x, device) for x in conf.get("childs", [])]
        return cls(conf["name"], conf["url_regexp"][device], childs)

    def parse_child_node(self, urls):
        """Build a Node from the first candidate URL that matches the regexp."""
        for url in urls:
            if self._url_regexp.match(url):
                return Node(self.name, url, self._childs)
        raise ValueError(f"URL not found: {urls}")


class RobotsConfig(object):
    def __init__(self, robotsurl):
        self._parser = RobotFileParser()
        self._parser.set_url(robotsurl)
        self._loaded = False

    def load(self):
        if self._loaded:
            return
        self._parser.read()
        self._loaded = True

    def can_fetch(self, useragent, url):
        return self._parser.can_fetch(useragent, url)

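The parser above implies the following configuration shape. A minimal sketch; every concrete value (site name, device key, URLs, regexes) is a hypothetical example:

```python
# Feeding a hand-written config through Config.parse. All values hypothetical.
import yaml

from nmqn._common import config as c

EXAMPLE_YAML = r"""
name: example-site
options:              # one entry per device; these keys drive DeviceConfig.parse
  desktop:
    useragent: "Mozilla/5.0 (X11; Linux x86_64)"
    viewport: {width: 1280, height: 800}
robots:
  desktop: 'https://example.com/robots.txt'
nodes:
  - name: top
    url:
      desktop: 'https://example.com/'
    childs:
      - name: article
        url_regexp:
          desktop: 'https://example\.com/articles/.+'
"""

conf = c.Config.parse(yaml.safe_load(EXAMPLE_YAML))
for devconf in conf:  # Config.__iter__ yields one DeviceConfig per device key
    print(devconf.device, [n.url for n in devconf.nodes])
```

Note that `url`, `url_regexp`, and `robots` are all keyed by device, so one config can describe desktop and mobile crawls side by side.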
nmqn/_common/path.py

import re
import time
from base64 import urlsafe_b64encode
from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path


def create_basepath(path, name):
    now = dt.now()
    date = now.strftime("%Y-%m-%d")
    unixtime = int(time.mktime(now.timetuple()))
    # expanduser so the default "~/.nmqn" output path works
    return Path(path).expanduser() / name / "logs" / date / str(unixtime)


def current_date(delta):
    return (dt.now() + timedelta(days=delta)).strftime("%Y%m%d")


def current_path(basepath, date):
    parent = Path(basepath) / dt.strptime(date, "%Y%m%d").strftime("%Y-%m-%d")
    # pick the most recent crawl of the day (directories are named by unixtime)
    return max(parent.iterdir(), key=lambda x: int(x.stem))


def encode_css_name(css_url):
    id_url = identify_url(css_url)
    return urlsafe_b64encode(id_url.encode("utf-8")).decode("utf-8") + ".css"


def identify_url(url):
    # Digits are usually dates or version numbers, so mask them out.
    return re.sub(r"\d", "*", url.split("?")[0])


nmqn/compare/__init__.py

from difflib import unified_diff
from pathlib import Path

import yaml

from .._common import config as c
from .._common import path as p
from ._view import ReportBuilder


def execute(confpath, today, yesterday, path):
    config = c.parse(confpath)
    if not today:
        today = p.current_date(0)
    if not yesterday:
        yesterday = p.current_date(-1)
    basepath = Path(path).expanduser().resolve() / config.name
    m = NodeManager(basepath, config)
    rb = ReportBuilder(basepath, f"{yesterday}-{today}")
    df_list = [AssetsDiffs.parse(t, y)
               for t, y in zip(m.iter_nodes(today), m.iter_nodes(yesterday))]
    for diffs in df_list:
        with rb.each_page(diffs) as ep:
            ep.build()
    with rb.top_page(df_list) as tp:
        tp.build()


class NodeManager(object):
    def __init__(self, basepath, config):
        self._config = config
        self._basepath = basepath / "logs"

    def iter_nodes(self, date):
        basepath = p.current_path(self._basepath, date)
        for devcon in self._config:
            for node in devcon.nodes:
                path = basepath / devcon.device
                yield Reader(path / node.name, devcon)
                yield from (Reader(path / child.name, devcon) for child in node.childs)


class AssetsDiffs(object):
    def __init__(self, name, device, nodename, today, yesterday,
                 after_path, before_path):
        # TODO: reconsider the argument order
        self._today = {x.id_url: x for x in today}
        self._yesterday = {x.id_url: x for x in yesterday}
        self.name = name
        self.device = device
        self.nodename = nodename
        self.before_capture_path = before_path
        self.after_capture_path = after_path

    @classmethod
    def parse(cls, today, yesterday):
        return cls(today.name, today.device, today.nodename,
                   today.iter_stylesheets(), yesterday.iter_stylesheets(),
                   today.capture_path, yesterday.capture_path)

    @property
    def added(self):
        keys = set(self._today.keys()) - set(self._yesterday.keys())
        return [self._today[k] for k in keys]

    @property
    def deleted(self):
        keys = set(self._yesterday.keys()) - set(self._today.keys())
        return [self._yesterday[k] for k in keys]

    @property
    def diffs(self):
        keys = set(self._today.keys()) & set(self._yesterday.keys())
        return [FileDiff(self._today[k], self._yesterday[k]) for k in keys]


class FileDiff(object):
    def __init__(self, today_asset, yesterday_asset):
        self._t = today_asset
        self._y = yesterday_asset
        with today_asset.path.open("r") as tf, yesterday_asset.path.open("r") as yf:
            # Diff from yesterday to today so additions show up as "+".
            # (The original passed the files in the opposite order.)
            self.diff = "".join(unified_diff(yf.readlines(), tf.readlines()))

    @property
    def url(self):
        return self._t.url

    @property
    def id_url(self):
        return self._t.id_url


class Reader(object):
    def __init__(self, path, deviceconfig):
        self._deviceconfig = deviceconfig
        self._path = path
        with (path / "result.yml").open("r") as f:
            self.result = yaml.safe_load(f)
        self.capture_path = Path(self.result["capture_path"]).resolve()

    @property
    def name(self):
        return self._deviceconfig.name

    @property
    def device(self):
        return self._deviceconfig.device

    @property
    def nodename(self):
        return self.result["name"]

    def iter_stylesheets(self):
        for s in self.result["stylesheets"]:
            yield StyleSheetInformation(s["url"], self._path / "stylesheets" / s["path"])


class StyleSheetInformation(object):
    def __init__(self, url, path):
        self.url = url
        self.path = path

    @property
    def id_url(self):
        # Digits are usually version numbers, so mask them out.
        return p.identify_url(self.url)

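Stylesheets are matched across days by `id_url` (the URL with its digits masked by `identify_url`), so a version bump like `app-1.css` → `app-2.css` is diffed as the same asset instead of showing up as one addition plus one deletion. A toy illustration of the set logic behind `AssetsDiffs`, using stand-in objects and hypothetical URLs:

```python
# Classifying assets into added / deleted / changed, keyed by id_url.
from types import SimpleNamespace as S

today = {x.id_url: x for x in [
    S(id_url="https://example.com/app-*.css", url="https://example.com/app-2.css"),
    S(id_url="https://example.com/new.css", url="https://example.com/new.css"),
]}
yesterday = {x.id_url: x for x in [
    S(id_url="https://example.com/app-*.css", url="https://example.com/app-1.css"),
    S(id_url="https://example.com/old.css", url="https://example.com/old.css"),
]}

print(set(today) - set(yesterday))   # added   -> {'https://example.com/new.css'}
print(set(yesterday) - set(today))   # deleted -> {'https://example.com/old.css'}
print(set(today) & set(yesterday))   # common keys, diffed with difflib.unified_diff
```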
nmqn/compare/_view.py

import os
import shutil
from pathlib import Path

import pweave
from jinja2 import Environment, FileSystemLoader

# TODO: there should be a better way to locate the templates
TEMPLATES = Environment(loader=FileSystemLoader('./nmqn/compare/templetes', encoding='utf8'))


class ReportBuilder(object):
    def __init__(self, basepath, name):
        self._basepath = basepath / "report" / name
        if os.path.exists(self._basepath):
            shutil.rmtree(self._basepath)
        self._basepath.mkdir(parents=True, exist_ok=True)

    def each_page(self, diffs):
        return EachPageBuilder(diffs, self._basepath)

    def top_page(self, diffs_list):
        return TopPageBuilder(diffs_list, self._basepath)


class TopPageBuilder(object):
    def __init__(self, diffs_list, basepath):
        self._diffs_list = diffs_list
        self._path = basepath
        self._templete = TEMPLATES.get_template('top.tpl.md')

    def __enter__(self):
        self._path.mkdir(parents=True, exist_ok=True)
        self._prev_path = os.getcwd()
        os.chdir(self._path)
        return self

    def __exit__(self, *args):
        os.chdir(self._prev_path)
        del self._prev_path
        return False

    def build(self):
        mdpath = self._build_markdown("markdown.md")
        outpath = self._path / "index.html"
        pweave.weave(str(mdpath.relative_to(self._path)),
                     output=str(outpath.absolute()),
                     doctype="pandoc2html")

    def _build_markdown(self, mdname):
        md = self._templete.render({
            "name": self._diffs_list[0].name,
            "pages": sorted([
                {
                    "name": x.nodename,
                    "device": x.device,
                    # Link relative to the top page, not an absolute filesystem path.
                    "path": Path(x.device) / x.nodename / "index.html",
                    "changed": len(x.diffs),
                    "added": len(x.added),
                    "deleted": len(x.deleted),
                }
                for x in self._diffs_list
            ], key=lambda r: (r["changed"], r["added"] + r["deleted"]), reverse=True),
        })
        mdpath = self._path / mdname
        with mdpath.open("w") as f:
            f.write(md)
        return mdpath


class EachPageBuilder(object):
    def __init__(self, diffs, basepath):
        self._diffs = diffs
        self._path = basepath / diffs.device / diffs.nodename
        self._templete = TEMPLATES.get_template('report.tpl.md')

    def __enter__(self):
        self._path.mkdir(parents=True, exist_ok=True)
        self._prev_path = os.getcwd()
        os.chdir(self._path)
        return self

    def __exit__(self, *args):
        os.chdir(self._prev_path)
        del self._prev_path
        return False

    def build(self):
        mdpath = self._build_markdown("markdown.md")
        outpath = self._path / "index.html"
        pweave.weave(str(mdpath.relative_to(self._path)),
                     output=str(outpath.absolute()),
                     doctype="pandoc2html")

    def _build_markdown(self, mdname):
        md = self._templete.render({
            "title": self._diffs.name,
            "added": [{"url": x.url} for x in self._diffs.added],
            "deleted": [{"url": x.url} for x in self._diffs.deleted],
            # The template iterates over "changed"; the original never passed it.
            "changed": [{"url": x.url, "diff": x.diff} for x in self._diffs.diffs],
            "before_path": self._copy_path(self._diffs.before_capture_path, "before"),
            "after_path": self._copy_path(self._diffs.after_capture_path, "after"),
        })
        mdpath = self._path / mdname
        with mdpath.open("w") as f:
            f.write(md)
        return mdpath

    def _copy_path(self, capture_path, name):
        path = self._path / "figures" / f"{name}.png"
        path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(capture_path, path)
        return str(path.relative_to(self._path))


nmqn/compare/templetes/report.tpl.md

# {{title}}

{% if added + deleted -%}
## added/deleted
{% for x in added -%}
* \+ [{{ x.url }}]({{ x.url }})
{% endfor -%}
{% for x in deleted -%}
* \- [{{ x.url }}]({{ x.url }})
{% endfor -%}
{%- endif -%}
{%- if changed -%}
## changed
{% for x in changed %}
**{{x.url}}**

```css
{{ x.diff }}
```
{%- endfor -%}
{%- endif %}

## captures

| before | after |
|:----|:----|
| ![{{before_path}}]({{before_path}}) | ![{{after_path}}]({{after_path}}) |


nmqn/compare/templetes/top.tpl.md

# {{name}}

|name|device|changed|added|deleted|
|:----|:----|:----|:----|:----|
{%- for x in pages %}
|[{{x.name}}]({{x.path}})|{{x.device}}|{{x.changed}}|{{x.added}}|{{x.deleted}}|
{%- endfor -%}

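Both builders follow the same two-step pipeline: render a Jinja2 template to markdown, then let pweave turn that markdown into HTML via pandoc. A standalone sketch of the pipeline, assuming it runs from the repository root with pandoc on PATH; the context values are hypothetical:

```python
# Render top.tpl.md to markdown, then weave it to HTML (mirrors TopPageBuilder.build).
import pweave
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('./nmqn/compare/templetes', encoding='utf8'))
md = env.get_template('top.tpl.md').render({
    "name": "example-site",
    "pages": [{"name": "top", "device": "desktop",
               "path": "desktop/top/index.html",
               "changed": 1, "added": 0, "deleted": 2}],
})
with open("markdown.md", "w") as f:
    f.write(md)
# "pandoc2html" delegates the markdown-to-HTML conversion to pandoc.
pweave.weave("markdown.md", output="index.html", doctype="pandoc2html")
```

The builders `chdir` into their output directory before weaving, which is why the figure and page links in the templates can stay relative.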
nmqn/crawl/__init__.py

import yaml

from .._common import config as c
from .._common import path as p
from ._crawler import crawl_all_nodes


def execute(confpath, max_tab, path, headless):
    config = c.parse(confpath)
    basepath = p.create_basepath(path, config.name)
    for result in crawl_all_nodes(config, max_tab, basepath, headless):
        saved = _save_stylesheets(basepath, result)
        _save_result(basepath, result, saved)


def _save_stylesheets(basepath, result):
    saved = []
    for stylesheet in result.stylesheets:
        path = (basepath / result.device / result.node.name
                / "stylesheets" / p.encode_css_name(stylesheet.url))
        path.parent.mkdir(exist_ok=True, parents=True)
        with path.open("w") as f:
            f.write(stylesheet.text)
        saved.append({"url": stylesheet.url, "path": path.name})
    return saved


def _save_result(basepath, result, saved):
    path = basepath / result.device / result.node.name / "result.yml"
    path.parent.mkdir(exist_ok=True, parents=True)
    with path.open("w") as f:
        f.write(yaml.dump({
            "name": result.node.name,
            "url": result.node.url,
            "redirected": result.html.url,
            "stylesheets": saved,
            # Path objects are not YAML-serializable; store the string.
            "capture_path": str(result.capture_path),
        }, default_flow_style=False))


nmqn/crawl/_crawler.py

from pathlib import Path

from nmqn.lib import crawler


def crawl_all_nodes(allconfig, max_tab, path, headless):
    for deviceconf in allconfig:
        client = CrawlClient(deviceconf, max_tab, Path(path) / deviceconf.device, headless)
        for node, result in client.crawl_all_nodes(deviceconf.nodes):
            yield NodeResponse(
                device=deviceconf.device,
                node=node,
                html=result.html,
                stylesheets=result.stylesheets,
                capture_path=result.capture_path)


class NodeResponse(object):
    def __init__(self, *, device, node, html, stylesheets, capture_path):
        self.device = device
        self.node = node
        self.html = html
        self.stylesheets = stylesheets
        self.capture_path = capture_path


class CrawlClient(object):
    def __init__(self, config, max_tab, path, headless):
        self._config = config
        self._max_tab = max_tab
        self._headless = headless
        self._path = path

    def crawl_all_nodes(self, nodes):
        if not nodes:
            return
        self._config.check_robots_txt(nodes)
        childs = []
        with crawler.SyncCrawler(max_tab=self._max_tab,
                                 headless=self._headless,
                                 capture_path=self._path,
                                 options=self._config.options) as c:
            for node, result in zip(nodes, c.walk(nodes)):
                yield node, result
                childs += [child.parse_child_node(result.html.absolute_links)
                           for child in node.childs]
        yield from self.crawl_all_nodes(childs)


nmqn/lib/__init__.py

(empty)

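After each page is fetched, `crawl_all_nodes` turns every `ChildNode` into a concrete `Node` by matching its regexp against the page's `absolute_links`, then recurses on those nodes with a fresh browser session. A small sketch of that resolution step; the regexp and links are hypothetical:

```python
# Resolving a ChildNode against a page's outgoing links.
from nmqn._common.config import ChildNode

child = ChildNode("article", r"https://example\.com/articles/.+", childs=[])
links = {"https://example.com/about", "https://example.com/articles/42"}

node = child.parse_child_node(links)   # picks a matching link; raises ValueError if none
print(node.name, node.url)             # -> article https://example.com/articles/42
```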
"capture.png") return (html, css, screenshot) results = await asyncio.gather(*[_inner(u) for u in nodes]) # タイムアウトしてしまうので、ブラウザ終了後にCSSを取得 return [Response(h, await fetch_stylesheets(c), s) for h, c, s in results] async def fetch_stylesheets(urls): return [await CssResponse.fetch(u) for u in urls] class Response(object): def __init__(self, html, stylesheets, capture_path): self.html = html self.stylesheets = stylesheets self.capture_path = capture_path class AsyncCrawler(object): def __init__(self, *, headless, options): self._headless = headless self._options = options async def __aenter__(self): # CORSを無効化して、別ドメインのCSSを参照できるようにする self._browser = await launch(headless=self._headless, args=['--disable-web-security']) # self._browser = await launch(headless=self._headless, ignoreHTTPSErrors=True) return self async def __aexit__(self, exc_type, exc, tb): await self._browser.close() return False async def open(self, url): return await CrawlerTab.open(self._browser, url, self._options) class CrawlerTab(object): def __init__(self, page, url): self._page = page self._url = url @classmethod async def open(cls, browser, url, options): page = await browser.newPage() if options.useragent: await page.setUserAgent(userAgent=options.useragent) if options.viewport: await page.setViewport(viewport=options.viewport) await page.goto(url) #TODO:: timeout= ? # TODO:: wait until loaded return cls(page, url) async def __aenter__(self): return self async def __aexit__(self, exc_type, exc, tb): await self._page.close() return False async def fetch(self): content = await self._page.evaluate('document.documentElement.innerHTML', force_expr=True) url = await self._page.evaluate('window.location.href', force_expr=True) return HTML(html=content, url=url) async def screenshot(self, path): path.parent.mkdir(exist_ok=True, parents=True) await self._page.screenshot(path=str(path), fullPage=True) return Path(path) async def fetchStyleSheets(self): return await self._page.evaluate("""() => { const result = []; for (const s of document.styleSheets) { if (s.href) result.push(s.href); } return result; }""") class CssResponse(object): def __init__(self, url, css): self.url = url self.text = css @classmethod async def fetch(cls, url): # TODO:: ブラウザから取得したい async with aiohttp.request('GET', url) as response: css = await response.text() return cls(url, css)PK!, nmqn/main.pyimport click from nmqn import crawl as crl from nmqn import compare as cmp DEFAULT_PATH = "~/.nmqn" # TODO:: ロギング @click.group() def cmd(): pass @cmd.command() @click.option('config', '-c', required=True, type=str, help='Config path.') @click.option('max_tab', '-m', default=10, type=int, help='Chrome Tab.') @click.option('path', '-p', default=DEFAULT_PATH, type=str, help='OutputPath') @click.option('--debug', is_flag=True, help='not headless mode') def crawl(config, max_tab, path, debug): crl.execute(config, max_tab, path, not debug) @cmd.command() @click.option('config', '-c', required=True, type=str, help='Config path.') @click.option('x', '-x', default=None, type=str, help='before path.') @click.option('y', '-y', default=None, type=str, help='after path.') @click.option('path', '-p', default=DEFAULT_PATH, type=str, help='Crawl path.') def compare(config, x, y, path): cmp.execute(config, x, y, path) def main(): cmd() if __name__ == "__main__": main()PK!H$'%nmqn-0.0.1.dist-info/entry_points.txtN+I/N.,()-̳zyV PK!HڽTUnmqn-0.0.1.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!H4onmqn-0.0.1.dist-info/METADATA[O@SL‹&vwtFa%Cwz3o%d!K(d1A—9tc8Ӳd_Hvw d=>gY= 
nmqn/main.py

import click

from nmqn import compare as cmp
from nmqn import crawl as crl

DEFAULT_PATH = "~/.nmqn"

# TODO: logging


@click.group()
def cmd():
    pass


@cmd.command()
@click.option('config', '-c', required=True, type=str, help='Config path.')
@click.option('max_tab', '-m', default=10, type=int, help='Number of Chrome tabs.')
@click.option('path', '-p', default=DEFAULT_PATH, type=str, help='Output path.')
@click.option('--debug', is_flag=True, help='Run with a visible (non-headless) browser.')
def crawl(config, max_tab, path, debug):
    crl.execute(config, max_tab, path, not debug)


@cmd.command()
@click.option('config', '-c', required=True, type=str, help='Config path.')
@click.option('x', '-x', default=None, type=str, help='Date of the "after" crawl (YYYYMMDD).')
@click.option('y', '-y', default=None, type=str, help='Date of the "before" crawl (YYYYMMDD).')
@click.option('path', '-p', default=DEFAULT_PATH, type=str, help='Crawl output path.')
def compare(config, x, y, path):
    cmp.execute(config, x, y, path)


def main():
    cmd()


if __name__ == "__main__":
    main()

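The two subcommands are thin wrappers over `crawl.execute` and `compare.execute`, so a daily crawl-and-compare cycle can also be scripted. A sketch, with the config path and output directory hypothetical:

```python
# Scripted equivalent of running `crawl` and then `compare` via the CLI group above.
from nmqn import compare as cmp
from nmqn import crawl as crl

crl.execute("config.yml", max_tab=10, path="~/.nmqn", headless=True)
# None for the dates falls back to p.current_date(0) / p.current_date(-1),
# i.e. compare today's crawl against yesterday's.
cmp.execute("config.yml", None, None, "~/.nmqn")
```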