PK! hWWcity_scrapers_core/__init__.py__version__ = "0.1.0" from .spiders import CityScrapersSpider, LegistarSpider # noqa PK!'city_scrapers_core/commands/__init__.pyPK!)+city_scrapers_core/commands/combinefeeds.pyimport json from datetime import datetime, timedelta from operator import itemgetter from urllib.parse import urlparse from scrapy.commands import ScrapyCommand from scrapy.exceptions import UsageError class Command(ScrapyCommand): requires_project = True def syntax(self): return "[options]" def short_desc(self): return "Combine all recent feeds into latest.json and upcoming.json" def run(self, args, opts): storages = self.settings.get("FEED_STORAGES", {}) if "s3" in storages: self.combine_s3() elif "azure" in storages: self.combine_azure() else: raise UsageError( "Either 's3' or 'azure' must be in FEED_STORAGES to combine past feeds" ) def combine_s3(self): import boto3 parsed = urlparse(self.settings.get("FEED_URI")) bucket = parsed.netloc feed_prefix = self.settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d") client = boto3.client( "s3", aws_access_key_id=self.settings.get("AWS_ACCESS_KEY_ID"), aws_secret_access_key=self.settings.get("AWS_SECRET_ACCESS_KEY"), ) max_days_previous = 3 days_previous = 0 prefix_objects = [] while days_previous <= max_days_previous: prefix_objects = client.list_objects( Bucket=bucket, Prefix=(datetime.now() - timedelta(days=days_previous)).strftime( feed_prefix ), ).get("Contents", []) if len(prefix_objects) > 0: break days_previous += 1 spider_keys = self.get_spider_paths([obj["Key"] for obj in prefix_objects]) meetings = [] for key in spider_keys: feed_text = ( client.get_object(Bucket=bucket, Key=key) .get("Body") .read() .decode("utf-8") ) meetings.extend( [json.loads(line) for line in feed_text.split("\n") if line.strip()] ) meetings = sorted(meetings, key=itemgetter("start")) yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19] upcoming = [meeting for meeting in meetings if meeting["start"] > yesterday_iso] client.put_object( Body=("\n".join([json.dumps(meeting) for meeting in meetings])).encode(), Bucket=bucket, CacheControl="no-cache", Key="latest.json", ) client.put_object( Body=("\n".join([json.dumps(meeting) for meeting in upcoming])).encode(), Bucket=bucket, CacheControl="no-cache", Key="upcoming.json", ) def combine_azure(self): from azure.storage.blob import BlockBlobService, ContentSettings feed_uri = self.settings.get("FEED_URI") feed_prefix = self.settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d") account_name, account_key = feed_uri[8::].split("@")[0].split(":") container = feed_uri.split("@")[1].split("/")[0] blob_service = BlockBlobService( account_name=account_name, account_key=account_key ) max_days_previous = 3 days_previous = 0 prefix_blobs = [] while days_previous <= max_days_previous: prefix_blobs = [ blob for blob in blob_service.list_blobs( container, prefix=(datetime.now() - timedelta(days=days_previous)).strftime( feed_prefix ), ) ] if len(prefix_blobs) > 0: break days_previous += 1 spider_blob_names = self.get_spider_paths([blob.name for blob in prefix_blobs]) meetings = [] for blob_name in spider_blob_names: feed_text = blob_service.get_blob_to_text(container, blob_name) meetings.extend( [json.loads(line) for line in feed_text.content.split("\n") if line] ) meetings = sorted(meetings, key=itemgetter("start")) yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19] upcoming = [meeting for meeting in meetings if meeting["start"] > yesterday_iso] blob_service.create_blob_from_text( container, "latest.json", "\n".join([json.dumps(meeting) for meeting in meetings]), content_settings=ContentSettings(cache_control="no-cache"), ) blob_service.create_blob_from_text( container, "upcoming.json", "\n".join([json.dumps(meeting) for meeting in upcoming]), content_settings=ContentSettings(cache_control="no-cache"), ) def get_spider_paths(self, path_list): """Get a list of the most recent scraper results for each spider""" spider_paths = [] for spider in self.crawler_process.spider_loader.list(): all_spider_paths = [p for p in path_list if spider in p] if len(all_spider_paths) > 0: spider_paths.append(sorted(all_spider_paths)[-1]) return spider_paths PK!ޢ`(city_scrapers_core/commands/genspider.pyimport json import shutil import string from datetime import datetime from importlib import import_module from os.path import abspath, dirname, join from urllib.parse import urlparse import requests from legistar.events import LegistarEventsScraper from scrapy.commands import ScrapyCommand from scrapy.exceptions import UsageError from scrapy.utils.template import render_templatefile USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36" # noqa class Command(ScrapyCommand): requires_project = False default_settings = {"LOG_ENABLED": False} def syntax(self): return " " def short_desc(self): return "Generate a new spider and test file for a City Scrapers project" def run(self, args, opts): if len(args) != 3: raise UsageError() name, agency, start_url = args[0:3] domain = urlparse(start_url).netloc spider_template = "spider.tmpl" test_template = "test.tmpl" if "legistar.com" in domain: proto = "https" if start_url.startswith("https") else "http" start_url = "{}://{}".format(proto, domain) spider_template = "spider_legistar.tmpl" test_template = "test_legistar.tmpl" fixture_file = self._gen_legistar_fixtures(name, start_url) else: fixture_file = self._gen_fixtures(name, start_url) classname = "{}Spider".format(string.capwords(name, sep="_").replace("_", "")) self._genspider(name, agency, classname, domain, start_url, spider_template) self._gen_tests(name, classname, fixture_file, test_template) def _genspider(self, name, agency, classname, domain, start_url, template_file): """Create spider from custom template""" template_dict = { "name": name, "agency": agency, "domain": domain, "start_url": start_url, "classname": "{}Spider".format( string.capwords(name, sep="_").replace("_", "") ), } spider_file = "{}.py".format(join(self.spiders_dir, name)) shutil.copyfile(join(self.templates_dir, template_file), spider_file) render_templatefile(spider_file, **template_dict) print("Created file: {}".format(spider_file)) def _gen_tests(self, name, classname, fixture_file, template_file): """Creates tests from test template file""" template_dict = { "name": name, "classname": classname, "fixture_file": fixture_file, "date_str": datetime.now().strftime("%Y-%m-%d"), } test_file = join(self.tests_dir, "test_{}.py".format(name)) shutil.copyfile(join(self.templates_dir, template_file), test_file) render_templatefile(test_file, **template_dict) print("Created file: {}".format(test_file)) def _gen_fixtures(self, name, start_url): """Creates fixures from HTML response at the start URL""" res = requests.get(start_url, headers={"user-agent": USER_AGENT}) content = res.text.strip() fixture_file = join(self.fixtures_dir, "{}.html".format(name)) with open(fixture_file, "w") as f: f.write(content) print("Created file: {}".format(fixture_file)) return "{}.html".format(name) def _gen_legistar_fixtures(self, name, start_url): """Creates fixtures from a Legistar response""" events = [] les = LegistarEventsScraper() les.BASE_URL = start_url les.EVENTSPAGE = "{}/Calendar.aspx".format(start_url) for event, _ in les.events(since=datetime.today().year): events.append((dict(event), None)) fixture_file = join(self.fixtures_dir, "{}.json".format(name)) with open(fixture_file, "w") as f: json.dump(events, f) print("Created file: {}".format(fixture_file)) return "{}.json".format(name) @property def spiders_dir(self): if self.settings.get("NEWSPIDER_MODULE"): spiders_module = import_module(self.settings["NEWSPIDER_MODULE"]) spiders_dir = abspath(dirname(spiders_module.__file__)) else: spiders_dir = "." return spiders_dir @property def templates_dir(self): return join(dirname(dirname(abspath(__file__))), "templates") @property def tests_dir(self): if self.spiders_dir == ".": return "." return join(dirname(dirname(self.spiders_dir)), "tests") @property def fixtures_dir(self): if self.tests_dir == ".": return "." return join(self.tests_dir, "files") PK!e]]#city_scrapers_core/commands/list.pyfrom scrapy.commands.list import Command as ExistingListCommand class Command(ExistingListCommand): def run(self, args, opts): for s in sorted(self.crawler_process.spider_loader.list()): cls = self.crawler_process.spider_loader.load(s) print("{0: <6} | {1}".format(s, getattr(cls, "agency", cls.agency_name))) PK! *%city_scrapers_core/commands/runall.pyfrom scrapy.commands import ScrapyCommand class Command(ScrapyCommand): requires_project = True def syntax(self): return "[options" def short_desc(self): return "Run all spiders in a project" def run(self, args, opts): for spider in self.crawler_process.spider_loader.list(): self.crawler_process.crawl(spider) self.crawler_process.start() PK!  'city_scrapers_core/commands/validate.pyimport os import re import subprocess from importlib import import_module from scrapy.commands import ScrapyCommand from scrapy.exceptions import UsageError from ..pipelines import ValidationPipeline class Command(ScrapyCommand): requires_project = True def syntax(self): return "[options] " def short_desc(self): return "Run a spider with validations, or validate all changed spiders in a PR" def add_options(self, parser): ScrapyCommand.add_options(self, parser) parser.add_option( "--all", dest="all", action="store_true", help="Run validation on all scrapers", ) def run(self, args, opts): self._add_validation_pipeline() in_ci = os.getenv("CI") if len(args) < 1 and not in_ci and not opts.all: raise UsageError( "At least one spider must be supplied or --all flag must be supplied " "if not in CI environment" ) if len(args) == 1: spiders = [args[0]] elif opts.all: spiders = self.crawler_process.spider_loader.list() elif in_ci: spiders = self._get_changed_spiders() if len(spiders) == 0: print("No spiders provided, exiting...") return for spider in spiders: self.crawler_process.crawl(spider) self.crawler_process.start() def _add_validation_pipeline(self): """Add validation pipeline to pipelines if not already present""" pipelines = self.settings.get("ITEM_PIPELINES", {}) pipeline_name = ValidationPipeline.__name__ # Exit if pipeline already included if any(pipeline_name in pipeline for pipeline in pipelines.keys()): return fullname = "{}.{}".format(ValidationPipeline.__module__, pipeline_name) priority = 1 if len(pipelines.keys()) > 0: priority = max(pipelines.values()) + 1 self.settings.set("ITEM_PIPELINES", {**pipelines, **{fullname: priority}}) self.settings.set("CITY_SCRAPERS_ENFORCE_VALIDATION", True) def _get_changed_spiders(self): """Checks git diff for spiders that have changed""" changed_spiders = [] travis_pr = os.getenv("TRAVIS_PULL_REQUEST") if not travis_pr or travis_pr == "false": print("Travis CI build not triggered by a pull request") return changed_spiders diff_command = "git diff --name-only --diff-filter=AM $TRAVIS_COMMIT_RANGE" diff_output = subprocess.check_output(diff_command.split()).decode("utf-8") for filename in diff_output.split("\n"): spider = re.search("(?<={}/)\w+(?=\.py)".format(self.spiders_dir), filename) if spider: changed_spiders.append(spider.group()) return changed_spiders @property def spiders_dir(self): spiders_module = import_module(self.settings.get("NEWSPIDER_MODULE")) return os.path.relpath(os.path.dirname(spiders_module.__file__)) PK!o$$city_scrapers_core/constants.pyNAMESPACE = "cityscrapers.org" ADVISORY_COMMITTEE = "Advisory Committee" BOARD = "Board" CITY_COUNCIL = "City Council" COMMISSION = "Commission" COMMITTEE = "Committee" FORUM = "Forum" POLICE_BEAT = "Police Beat" NOT_CLASSIFIED = "Not classified" CLASSIFICATIONS = ( ADVISORY_COMMITTEE, BOARD, CITY_COUNCIL, COMMISSION, COMMITTEE, FORUM, POLICE_BEAT, NOT_CLASSIFIED, ) CANCELLED = "cancelled" TENTATIVE = "tentative" CONFIRMED = "confirmed" PASSED = "passed" STATUSES = (CANCELLED, TENTATIVE, CONFIRMED, PASSED) PK!:*u city_scrapers_core/decorators.pyfrom functools import wraps def ignore_jscalendar(func): """Method decorator to ignore JSCalendar items passed to pipeline by middleware""" @wraps(func) def wrapper(*args, **kwargs): for i in range(2): if isinstance(args[i], dict) and "cityscrapers.org/id" in args[i]: return args[i] return func(*args, **kwargs) return wrapper PK!>)city_scrapers_core/extensions/__init__.pyfrom .azure_status import AzureBlobStatusExtension # noqa from .azure_storage import AzureBlobFeedStorage # noqa from .s3_status import S3StatusExtension # noqa PK!@-city_scrapers_core/extensions/azure_status.pyfrom .status import StatusExtension class AzureBlobStatusExtension(StatusExtension): def update_status_svg(self, spider, svg): from azure.storage.blob import BlockBlobService, ContentSettings blob_service = BlockBlobService( account_name=self.crawler.settings.get("AZURE_ACCOUNT_NAME"), account_key=self.crawler.settings.get("AZURE_ACCOUNT_KEY"), ) blob_service.create_blob_from_text( self.crawler.settings.get("CITY_SCRAPERS_STATUS_CONTAINER"), "{}.svg".format(spider.name), svg, content_settings=ContentSettings( content_type="image/svg+xml", cache_control="no-cache" ), ) PK!!B[((.city_scrapers_core/extensions/azure_storage.pyfrom scrapy.extensions.feedexport import BlockingFeedStorage class AzureBlobFeedStorage(BlockingFeedStorage): def __init__(self, uri): from azure.storage.blob import BlockBlobService container = uri.split("@")[1].split("/")[0] filename = "/".join(uri.split("@")[1].split("/")[1::]) account_name, account_key = uri[8::].split("@")[0].split(":") self.account_name = account_name self.account_key = account_key self.container = container self.filename = filename self.blob_service = BlockBlobService( account_name=self.account_name, account_key=self.account_key ) def _store_in_thread(self, file): file.seek(0) self.blob_service.create_blob_from_stream(self.container, self.filename, file) PK!9*city_scrapers_core/extensions/s3_status.pyfrom .status import StatusExtension class S3StatusExtension(StatusExtension): def update_status_svg(self, spider, svg): import boto3 s3_client = boto3.client( "s3", aws_access_key_id=self.crawler.settings.get("AWS_ACCESS_KEY_ID"), aws_secret_access_key=self.crawler.settings.get("AWS_SECRET_ACCESS_KEY"), ) s3_client.put_object( Body=svg.encode(), Bucket=self.crawler.settings.get("CITY_SCRAPERS_STATUS_BUCKET"), CacheControl="no-cache", ContentType="image/svg+xml", Key="{}.svg".format(spider.name), ) PK! 'city_scrapers_core/extensions/status.pyfrom datetime import datetime import pytz from scrapy import signals RUNNING = "running" FAILING = "failing" STATUS_COLOR_MAP = {RUNNING: "#44cc11", FAILING: "#cb2431"} STATUS_ICON = """ {status} {status} {date} {date} """ # noqa class StatusExtension: """ Scrapy extension for maintaining an SVG badge for each scraper's status. TODO: Track how many items are scraped on each run. """ def __init__(self, crawler): self.crawler = crawler self.has_error = False @classmethod def from_crawler(cls, crawler): ext = cls(crawler) crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) crawler.signals.connect(ext.spider_error, signal=signals.spider_error) return ext def spider_closed(self): if self.has_error: return svg = self.create_status_svg(self.crawler.spider, RUNNING) self.update_status_svg(self.crawler.spider, svg) def spider_error(self): self.has_error = True svg = self.create_status_svg(self.crawler.spider, FAILING) self.update_status_svg(self.crawler.spider, svg) def create_status_svg(self, spider, status): tz = pytz.timezone(spider.timezone) return STATUS_ICON.format( color=STATUS_COLOR_MAP[status], status=status, date=tz.localize(datetime.now()).strftime("%Y-%m-%d"), ) def update_status_svg(self, spider, svg): raise NotImplementedError PK!5 city_scrapers_core/items.pyimport scrapy from .constants import CLASSIFICATIONS, STATUSES class Meeting(scrapy.Item): id = scrapy.Field() title = scrapy.Field() description = scrapy.Field() classification = scrapy.Field() status = scrapy.Field() start = scrapy.Field() end = scrapy.Field() all_day = scrapy.Field() time_notes = scrapy.Field() location = scrapy.Field() links = scrapy.Field() source = scrapy.Field() jsonschema = { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Meeting Item", "type": "object", "definitions": { "location": { "type": "object", "properties": { "name": {"type": "string"}, "address": {"type": "string"}, }, }, "link": { "type": "object", "properties": { "href": {"type": "string", "format": "uri"}, "title": {"type": "string"}, }, "required": ["href"], }, }, "properties": { "id": { "type": "string", "description": "An ID based on the scraper slug, date and time of the meeting", # noqa }, "title": {"type": "string", "description": "The title of the meeting"}, "description": { "type": "string", "description": "A description of the specific meeting", }, "all_day": { "type": "boolean", "description": "Whether the meeting occurs for the entire day", }, "status": { "type": "string", "description": "The status of the meeting at the time it is scraped", "enum": list(STATUSES), }, "classification": { "type": "string", "description": "The type of meeting from the list of options", "enum": list(CLASSIFICATIONS), }, "start": { "type": "string", "description": "The datetime the meeting begins in local time in ISO 8601 format", # noqa "format": "date-time", }, "end": { "type": "string", "description": "The datetime the meeting ends in local time in ISO 8601 format", # noqa "format": "date-time", }, "time_notes": { "type": "string", "description": "Any additional notes about the meeting time", }, "location": { "type": "object", "description": "The location where the meeting occurs", }, "links": {"type": "array"}, "source": {"type": "string", "format": "url"}, }, "required": ["id", "title", "start", "source"], } PK!CAbb*city_scrapers_core/middlewares/__init__.pyfrom .azure_diff import AzureDiffMiddleware # noqa from .s3_diff import S3DiffMiddleware # noqa PK!At,city_scrapers_core/middlewares/azure_diff.pyimport json from datetime import datetime, timedelta from operator import attrgetter from pytz import timezone from .diff import DiffMiddleware class AzureDiffMiddleware(DiffMiddleware): """Azure Blob Storage backend for comparing previously scraped JSCalendar outputs""" def __init__(self, spider, settings): from azure.storage.blob import BlockBlobService self.spider = spider feed_uri = settings.get("FEED_URI") account_name, account_key = feed_uri[8::].split("@")[0].split(":") self.blob_service = BlockBlobService( account_name=account_name, account_key=account_key ) self.container = feed_uri.split("@")[1].split("/")[0] self.feed_prefix = settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d") super().__init__() def load_previous_results(self): max_days_previous = 3 days_previous = 0 tz = timezone(self.spider.timezone) while days_previous <= max_days_previous: matching_blobs = self.blob_service.list_blobs( self.container, prefix=( tz.localize(datetime.now()) - timedelta(days=days_previous) ).strftime(self.feed_prefix), ) spider_blobs = [ blob for blob in matching_blobs if self.spider.name in blob.name ] if len(spider_blobs) > 0: break days_previous += 1 if len(spider_blobs) == 0: return [] blob = sorted(spider_blobs, key=attrgetter("name"))[-1] feed_text = self.blob_service.get_blob_to_text(self.container, blob.name) return [ json.loads(line) for line in feed_text.content.split("\n") if line.strip() ] PK!2&city_scrapers_core/middlewares/diff.pyfrom datetime import datetime from city_scrapers_core.constants import CANCELLED from city_scrapers_core.items import Meeting class DiffMiddleware: """ Class for comparing previous feed export results in JSCalendar format and either merging UIDs for consistency or marking upcoming meetings that no longer appear as cancelled. Provider-specific backends can be created by subclassing and implementing the `load_previous_results` method. """ def __init__(self, *args, **kwargs): self.previous_results = self.load_previous_results() self.previous_map = { result["cityscrapers.org/id"]: result["uid"] for result in self.previous_results } @classmethod def from_crawler(cls, crawler): return cls(crawler.spider, crawler.settings) def process_spider_output(self, response, result, spider): """ Merge the UIDs of previously scraped meetings and cancel any upcoming meetings that no longer appear in results """ scraped_ids = set() # Merge the previous UID into the item if it's already been scraped before for item in result: if isinstance(item, Meeting) or isinstance(item, dict): scraped_ids.add(item["id"]) if item["id"] in self.previous_map: # Bypass __setitem__ call on Meeting to add uid if isinstance(item, Meeting): item._values["uid"] = self.previous_map[item["id"]] else: item["uid"] = self.previous_map[item["id"]] yield item now_iso = datetime.now().isoformat()[:19] for item in self.previous_results: # Ignore items that are already included or are in the past if item["cityscrapers.org/id"] in scraped_ids or item["start"] < now_iso: continue # If the item is upcoming and outside the prior criteria, mark it cancelled scraped_ids.add(item["cityscrapers.org/id"]) yield {**item, "status": CANCELLED} def load_previous_results(self): """Return a list of dictionaries loaded from a previous feed export""" raise NotImplementedError PK!HB)city_scrapers_core/middlewares/s3_diff.pyimport json from datetime import datetime, timedelta from operator import itemgetter from urllib.parse import urlparse from pytz import timezone from .diff import DiffMiddleware class S3DiffMiddleware(DiffMiddleware): """S3 backend for comparing previously scraped JSCalendar outputs""" def __init__(self, spider, settings): import boto3 parsed = urlparse(settings.get("FEED_URI")) self.spider = spider self.feed_prefix = settings.get("CITY_SCRAPERS_DIFF_FEED_PREFIX", "%Y/%m/%d") self.bucket = parsed.netloc self.client = boto3.client( "s3", aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"), aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"), ) super().__init__() def load_previous_results(self): max_days_previous = 3 days_previous = 0 tz = timezone(self.spider.timezone) while days_previous <= max_days_previous: match_objects = self.client.list_objects( Bucket=self.bucket, Prefix=( tz.localize(datetime.now()) - timedelta(days=days_previous) ).strftime(self.feed_prefix), MaxKeys=1000, ) spider_objects = [ obj for obj in match_objects.get("Contents", []) if self.spider.name in obj["Key"] ] if len(spider_objects) > 0: break days_previous += 1 if len(spider_objects) == 0: return [] obj = sorted(spider_objects, key=itemgetter("Key"))[-1] feed_text = ( self.client.get_object(Bucket=self.bucket, Key=obj["Key"]) .get("Body") .read() .decode("utf-8") ) return [json.loads(line) for line in feed_text.split("\n") if line.strip()] PK!Q(city_scrapers_core/pipelines/__init__.pyfrom .default import DefaultValuesPipeline # noqa from .jscalendar import JSCalendarPipeline # noqa from .meeting import MeetingPipeline # noqa from .validation import ValidationPipeline # noqa PK!!5LZZ'city_scrapers_core/pipelines/default.pyfrom city_scrapers_core.constants import NOT_CLASSIFIED, TENTATIVE from city_scrapers_core.decorators import ignore_jscalendar class DefaultValuesPipeline: """Sets default values for Meeting items""" @ignore_jscalendar def process_item(self, item, spider): item.setdefault("description", "") item.setdefault("all_day", False) item.setdefault("location", {}) item.setdefault("links", []) item.setdefaults("time_notes", "") item.setdefaults("classification", NOT_CLASSIFIED) item.setdefaults("status", TENTATIVE) return item PK!e0*city_scrapers_core/pipelines/jscalendar.pyfrom datetime import datetime from uuid import uuid4 from city_scrapers_core.decorators import ignore_jscalendar class JSCalendarPipeline: """ Pipeline for transforming Meeting items into JSCalendar format https://tools.ietf.org/html/draft-ietf-calext-jscalendar-11 """ @ignore_jscalendar def process_item(self, item, spider): return { "@type": "jsevent", "uid": item.get("uid") or str(uuid4()), "title": item["title"], "updated": datetime.now().isoformat()[:19], "description": item["description"], "isAllDay": item["all_day"], "status": item["status"], "start": item["start"].isoformat()[:19], "timeZone": spider.timezone, "duration": self.create_duration(item), "locations": {"location": item["location"]}, "links": self.create_links(item), "cityscrapers.org/id": item["id"], "cityscrapers.org/timeNotes": item["time_notes"], "cityscrapers.org/agency": spider.agency, "cityscrapers.org/classification": item["classification"], } def create_links(self, item): """Generate a mapping of link URLs and dictionaries for JSCalendar""" link_map = {link["href"]: link for link in item["links"]} link_map["cityscrapers.org/source"] = { "href": item["source"], "title": "Source", } return link_map def create_duration(self, item): """Create ISO 8601 duration string from start and end datetimes""" time_diff = item["end"] - item["start"] dur_str = "P" if time_diff.days > 0: dur_str += "{}D".format(time_diff.days) if time_diff.seconds > 0: dur_str += "T" if time_diff.seconds >= 3600: dur_str += "{}H".format(time_diff.seconds // 3600) if (time_diff.seconds // 60) % 60 > 0: dur_str += "{}M".format((time_diff.seconds // 60) % 60) return dur_str PK!Dځbb'city_scrapers_core/pipelines/meeting.pyfrom datetime import timedelta from city_scrapers_core.decorators import ignore_jscalendar class MeetingPipeline: @ignore_jscalendar def process_item(self, item, spider): item["title"] = spider._clean_title(item["title"]) if not item.get("end"): item["end"] = item["start"] + timedelta(hours=2) return item PK! % % *city_scrapers_core/pipelines/validation.pyimport re from collections import defaultdict from jsonschema.validators import Draft7Validator class ValidationPipeline: """ Check against schema if present, prints % valid for each property. Raises an exception for invalid results if CITY_SCRAPERS_ENFORCE_VALIDATION is set. """ @classmethod def from_crawler(cls, crawler): obj = cls() obj.enforce_validation = crawler.settings.getbool( "CITY_SCRAPERS_ENFORCE_VALIDATION" ) return obj def open_spider(self, spider): self.item_count = 0 self.error_count = defaultdict(int) def close_spider(self, spider): self.validation_report(spider) def process_item(self, item, spider): if not hasattr(item, "jsonschema"): return item item_dict = dict(item) item_dict["start"] = item_dict["start"].isoformat()[:19] item_dict["end"] = item_dict["end"].isoformat()[:19] validator = Draft7Validator(item.jsonschema) props = list(item.jsonschema["properties"].keys()) errors = list(validator.iter_errors(item_dict)) error_props = [self._get_prop_from_error(error) for error in errors] for prop in props: self.error_count[prop] += 1 if prop in error_props else 0 self.item_count += 1 return item def validation_report(self, spider): """Prints a validation report to stdout and raise an error if fails""" props = list(self.error_count.keys()) print( "\n{line}Validation summary for: {spider}{line}".format( line="-" * 12, spider=spider.name ) ) print("Validating {} items\n".format(self.item_count)) valid_list = [] for prop in props: valid = (self.item_count - self.error_count[prop]) / self.item_count print("{}: {:.0%}".format(prop, valid)) try: assert all([val >= 0.9 for val in valid_list]) except AssertionError: message = ( "Less than 90% of the scraped items from {} passed validation. See " "the validation summary printed in stdout, and check that the " "scraped items are valid according to the jsonschema property of " "the Meeting class." ).format(spider.name) if self.enforce_validation: raise Exception(message) else: print(message) def _get_prop_from_error(self, error): return re.search(r"(?<=')\w+(?=')").group() PK!OH\\&city_scrapers_core/spiders/__init__.pyfrom .legistar import LegistarSpider # noqa from .spider import CityScrapersSpider # noqa PK!e98ss&city_scrapers_core/spiders/legistar.pyfrom datetime import datetime import urllib3 from legistar.events import LegistarEventsScraper from .spider import CityScrapersSpider LINK_TYPES = ["Agenda", "Minutes", "Video", "Summary", "Captions"] class LegistarSpider(CityScrapersSpider): link_types = [] def parse(self, response): events = self._call_legistar() return self.parse_legistar(events) def _call_legistar(self, since=None): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) les = LegistarEventsScraper() les.BASE_URL = self.base_url les.EVENTSPAGE = "{}/Calendar.aspx".format(self.base_url) if not since: since = datetime.today().year return les.events(since=since) def legistar_start(self, item): start_date = item.get("Meeting Date") start_time = item.get("Meeting Time") if start_date and start_time: return datetime.strptime( "{} {}".format(start_date, start_time), "%m/%d/%Y %I:%M %p" ) def legistar_links(self, item): links = [] for link_type in LINK_TYPES + self.link_types: if isinstance(item.get(link_type), dict) and item[link_type].get("url"): links.append({"href": item[link_type]["url"], "title": link_type}) return links def legistar_source(self, item): return item.get("Name", {}).get("url", "{}/Calendar.aspx".format(self.base_url)) @property def base_url(self): proto = "https" if self.start_urls[0].startswith("https://") else "http" return "{}://{}".format(proto, self.allowed_domains[0]) PK!8c{{$city_scrapers_core/spiders/spider.pyimport re from datetime import datetime from pytz import timezone from scrapy import Spider from ..constants import CANCELLED, PASSED, TENTATIVE class CityScrapersSpider(Spider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Add parameters for feed storage in spider local time if not hasattr(self, "timezone"): self.timezone = "America/Chicago" tz = timezone(self.timezone) now = tz.localize(datetime.now()) self.year = now.year self.month = now.strftime("%m") self.day = now.strftime("%d") self.hour_min = now.strftime("%H%M") def _clean_title(self, title): """Remove cancelled strings from title""" return re.sub( r"([\s:-]{1,3})?(cancel\w+|rescheduled)([\s:-]{1,3})?", "", title, flags=re.IGNORECASE, ).strip() def _get_id(self, item, identifier=None): """Create an ID based off of the meeting details, title and any identifiers""" underscore_title = re.sub( r"\s+", "_", re.sub(r"[^A-Z^a-z^0-9^]+", " ", self._clean_title(item["title"])), ).lower() item_id = (identifier or "x").replace("/", "-") start_str = item["start"].strftime("%Y%m%d%H%M") return "/".join([self.name, start_str, item_id, underscore_title]) def _get_status(self, item, text=""): """ Generates one of the allowed statuses from constants based on the title and time of the meeting """ meeting_text = " ".join( [item.get("title", ""), item.get("description", ""), text] ).lower() if any(word in meeting_text for word in ["cancel", "rescheduled", "postpone"]): return CANCELLED if item["start"] < datetime.now(): return PASSED return TENTATIVE PK!U^q q (city_scrapers_core/templates/spider.tmplfrom city_scrapers_core.constants import NOT_CLASSIFIED from city_scrapers_core.items import Meeting from city_scrapers_core.spiders import CityScrapersSpider class $classname(CityScrapersSpider): name = "$name" agency = "$agency" timezone = "America/Chicago" allowed_domains = ["$domain"] start_urls = ["$start_url"] def parse(self, response): """ `parse` should always `yield` a Meeting item. Change the `_parse_id`, `_parse_name`, etc methods to fit your scraping needs. """ for item in response.css(".meetings"): meeting = Meeting( title=self._parse_title(item), description=self._parse_description(item), classification=self._parse_classification(item), start=self._parse_start(item), end=self._parse_end(item), all_day=self._parse_all_day(item), time_notes=self._parse_time_notes(item), location=self._parse_location(item), links=self._parse_links(item), source=self._parse_source(response), ) meeting["status"] = self._get_status(meeting) meeting["id"] = self._get_id(meeting) yield meeting def _parse_title(self, item): """Parse or generate meeting title.""" return "" def _parse_description(self, item): """Parse or generate meeting description.""" return "" def _parse_classification(self, item): """Parse or generate classification from allowed options.""" return NOT_CLASSIFIED def _parse_start(self, item): """Parse start datetime as a naive datetime object.""" return None def _parse_end(self, item): """Parse end datetime as a naive datetime object. Added by pipeline if None""" return None def _parse_all_day(self, item): """Parse or generate all-day status. Defaults to False.""" return False def _parse_location(self, item): """Parse or generate location.""" return { "address": "", "name": "", } def _parse_links(self, item): """Parse or generate links.""" return [{"href": "", "title": ""}] def _parse_source(self, response): """Parse or generate source.""" return response.url PK!o\\1city_scrapers_core/templates/spider_legistar.tmplfrom city_scrapers_core.constants import NOT_CLASSIFIED from city_scrapers_core.items import Meeting from city_scrapers_core.spiders import LegistarSpider class $classname(LegistarSpider): name = "$name" agency = "$agency" timezone = "America/Chicago" allowed_domains = ["$domain"] start_urls = ["$start_url"] # Add the titles of any links not included in the scraped results link_types = [] def parse_legistar(self, events): for event, _ in events: meeting = Meeting( title=event["Name"]["label"], description=self._parse_description(event), classification=self._parse_classification(event), start=self.legistar_start(event), end=self._parse_end(event), all_day=self._parse_all_day(event), time_notes=self._parse_time_notes(event), location=self._parse_location(event), links=self.legistar_links(event), source=self.legistar_source(event), ) meeting["status"] = self._get_status(meeting) meeting["id"] = self._get_id(meeting) yield meeting def _parse_description(self, item): """Parse or generate meeting description.""" return "" def _parse_classification(self, item): """Parse or generate classification from allowed options.""" return NOT_CLASSIFIED def _parse_end(self, item): """Parse end datetime as a naive datetime object. Added by pipeline if None""" return None def _parse_all_day(self, item): """Parse or generate all-day status. Defaults to False.""" return False def _parse_location(self, item): """Parse or generate location.""" return { "address": "", "name": "", } PK!#q^&city_scrapers_core/templates/test.tmplimport pytest from freezegun import freeze_time from tests.utils import file_response from city_scrapers.spiders.$name import $classname test_response = file_response("files/$fixture_file") spider = $classname() freezer = freeze_time("$date_str") freezer.start() parsed_items = [item for item in spider.parse(test_response)] freezer.stop() def test_tests(): print("Please write some tests for this spider or at least disable this one.") assert False """ Uncomment below """ # def test_name(): # assert parsed_items[0]["name"] == "EXPECTED NAME" # def test_description(): # assert parsed_items[0]["event_description"] == "EXPECTED DESCRIPTION" # def test_start(): # assert parsed_items[0]["start"] == {"date": None, "time": None, "note": "EXPECTED DATE AND TIME"} # def test_end(): # assert parsed_items[0]["end"] == {"date": None, "time": None, "note": "EXPECTED DATE AND TIME"} # def test_id(): # assert parsed_items[0]["id"] == "EXPECTED ID" # def test_status(): # assert parsed_items[0]["status"] == "EXPECTED STATUS" # def test_location(): # assert parsed_items[0]["location"] == { # "neighborhood": "EXPECTED URL", # "name": "EXPECTED NAME", # "address": "EXPECTED ADDRESS" # } # def test_sources(): # assert parsed_items[0]["sources"] == [{ # "url": "EXPECTED URL", # "note": "EXPECTED NOTE" # }] # def test_documents(): # assert parsed_items[0]["documents"] == [{ # "url": "EXPECTED URL", # "note": "EXPECTED NOTE" # }] # @pytest.mark.parametrize("item", parsed_items) # def test_all_day(item): # assert item["all_day"] is False # @pytest.mark.parametrize("item", parsed_items) # def test_classification(item): # assert item["classification"] is None # @pytest.mark.parametrize("item", parsed_items) # def test__type(item): # assert parsed_items[0]["_type"] == "event" PK!o5y[/city_scrapers_core/templates/test_legistar.tmplimport json import pytest from freezegun import freeze_time from city_scrapers.spiders.$name import $classname freezer = freeze_time("$date_str") freezer.start() with open("files/$fixture_file", "r") as f: test_response = json.load(f) spider = $classname() parsed_items = [item for item in spider.parse_legistar(test_response)] freezer.stop() def test_tests(): print("Please write some tests for this spider or at least disable this one.") assert False """ Uncomment below """ # def test_name(): # assert parsed_items[0]["name"] == "EXPECTED NAME" # def test_description(): # assert parsed_items[0]["event_description"] == "EXPECTED DESCRIPTION" # def test_start(): # assert parsed_items[0]["start"] == {"date": None, "time": None, "note": "EXPECTED DATE AND TIME"} # def test_end(): # assert parsed_items[0]["end"] == {"date": None, "time": None, "note": "EXPECTED DATE AND TIME"} # def test_id(): # assert parsed_items[0]["id"] == "EXPECTED ID" # def test_status(): # assert parsed_items[0]["status"] == "EXPECTED STATUS" # def test_location(): # assert parsed_items[0]["location"] == { # "neighborhood": "EXPECTED URL", # "name": "EXPECTED NAME", # "address": "EXPECTED ADDRESS" # } # def test_sources(): # assert parsed_items[0]["sources"] == [{ # "url": "EXPECTED URL", # "note": "EXPECTED NOTE" # }] # def test_documents(): # assert parsed_items[0]["documents"] == [{ # "url": "EXPECTED URL", # "note": "EXPECTED NOTE" # }] # @pytest.mark.parametrize("item", parsed_items) # def test_all_day(item): # assert item["all_day"] is False # @pytest.mark.parametrize("item", parsed_items) # def test_classification(item): # assert item["classification"] is None # @pytest.mark.parametrize("item", parsed_items) # def test__type(item): # assert parsed_items[0]["_type"] == "event" PK!GK,,*city_scrapers_core-0.1.0.dist-info/LICENSEMIT License Copyright (c) 2019 City Bureau Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HnHTU(city_scrapers_core-0.1.0.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!HQ-+city_scrapers_core-0.1.0.dist-info/METADATATMs0W$0Wh 2C^:lĶTIu~}W#.O\8DcCh.2 ! ll\+-d"˄)CRE;ʊPcڨGeT X:mh^V\2vtϮ T&p0h63!p|0( gSSH6$W${ k^Wq1.a!ESA[!1rׁ|v(&NDPwoXYF`FDSq[a [tD+isB8pڇI[MzWϝI:AȧGT4  NUA1aBNgksA,R /ܼN61S 4CW$(EUވ3ʣ2m_qr^L7iS CU1oK2Es$]3Ky\{}[Z/!`>ESViSO<̀ay %a\qCB~ U#xG%N|Y*ba\_nV;He|np76@tbv&F&jp6lOm8KH&` Ƕ2vup9~->^'#9 ]<"l/Eg6 q5Hz\qNdSؕ-Q%ӟp2ш tG|!kw齊z " _Va5Eu/tN8,ݾ!v+{m( > ͞ ;垆Gz$N 1@ B@;A497XW(TM.GAVY^t^nKNYhGR QcGb"R/,ފC@7)dШ0٬AQ&N_PP$  6C^_e ڞ|łpŷu$x=شI!n":T8Cq EQ6X3ȞS–zY4 Eta 1^l4Z/^ѽV; P? uu՚?A 8 cőw3),&UI6YrdND7;@KJ'z7͉,Áj(z'㒛9rkhzF? ;dSwNTs_ԦR,8gsyo1hN}2I 7_Yq$vH]&#oȷdET^4񻄾(vK$>O<^ z>6U~*僄q)3OG0{E#>v3VGmo^ ]tYu7Em+IxiW`9$3ffi`mPK! hWWcity_scrapers_core/__init__.pyPK!'city_scrapers_core/commands/__init__.pyPK!)+city_scrapers_core/commands/combinefeeds.pyPK!ޢ`(city_scrapers_core/commands/genspider.pyPK!e]]#(city_scrapers_core/commands/list.pyPK! *%*city_scrapers_core/commands/runall.pyPK!  'Z,city_scrapers_core/commands/validate.pyPK!o$$8city_scrapers_core/constants.pyPK!:*u ;city_scrapers_core/decorators.pyPK!>)<city_scrapers_core/extensions/__init__.pyPK!@-=city_scrapers_core/extensions/azure_status.pyPK!!B[((.@city_scrapers_core/extensions/azure_storage.pyPK!9*`Dcity_scrapers_core/extensions/s3_status.pyPK! '.Gcity_scrapers_core/extensions/status.pyPK!5 5Qcity_scrapers_core/items.pyPK!CAbb*3]city_scrapers_core/middlewares/__init__.pyPK!At,]city_scrapers_core/middlewares/azure_diff.pyPK!2&/ecity_scrapers_core/middlewares/diff.pyPK!HB)Yncity_scrapers_core/middlewares/s3_diff.pyPK!Q(vcity_scrapers_core/pipelines/__init__.pyPK!!5LZZ'+wcity_scrapers_core/pipelines/default.pyPK!e0*ycity_scrapers_core/pipelines/jscalendar.pyPK!Dځbb' city_scrapers_core/pipelines/meeting.pyPK! % % *ǃcity_scrapers_core/pipelines/validation.pyPK!OH\\&4city_scrapers_core/spiders/__init__.pyPK!e98ss&Ԏcity_scrapers_core/spiders/legistar.pyPK!8c{{$city_scrapers_core/spiders/spider.pyPK!U^q q (Hcity_scrapers_core/templates/spider.tmplPK!o\\1city_scrapers_core/templates/spider_legistar.tmplPK!#q^&city_scrapers_core/templates/test.tmplPK!o5y[/vcity_scrapers_core/templates/test_legistar.tmplPK!GK,,*Rcity_scrapers_core-0.1.0.dist-info/LICENSEPK!HnHTU(city_scrapers_core-0.1.0.dist-info/WHEELPK!HQ-+`city_scrapers_core-0.1.0.dist-info/METADATAPK!Hd`i )city_scrapers_core-0.1.0.dist-info/RECORDPK##