# -*- coding: utf-8 -*-
"""Download papers from arXiv or the ACL Anthology as PDFs named
"[FirstAuthorLastName+Year] Title.pdf" (source of the download_arxiv 0.1.5 wheel)."""
import os
import sys
import time
import urllib.request
from urllib.request import urlretrieve

import arxiv  # targets the legacy arxiv package (<1.0), whose query() returns dicts
import click
from dateutil.parser import parse
from logzero import logger
from pybtex import database


def reporthook(count, block_size, total_size):
    """Print download progress; passed to urlretrieve as its reporthook."""
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = max(time.time() - start_time, 1e-6)  # guard against division by zero
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write(
        "\r{}%, {} KB, {} KB/s, {:.1f} seconds passed".format(
            min(percent, 100), progress_size / 1024, speed, duration))
    sys.stdout.flush()


def download_from_arxiv(url, dirpath='./'):
    """Download the PDF for an arXiv abs/pdf URL into dirpath."""
    if url.endswith('.pdf'):
        paper_id = os.path.splitext(os.path.basename(url))[0]
    else:
        paper_id = os.path.basename(url)
    paper = arxiv.query(id_list=[paper_id])[0]

    def custom_slugify(obj):
        author_last_name = obj['authors'][0].strip().split(' ')[-1]
        year = parse(obj['published']).year
        title = obj['title'].strip().replace('\n', '')
        logger.info('Download "{}" from "{}"'.format(title, obj['pdf_url']))
        return '[{}+{}] {}'.format(author_last_name, year, title)

    if not paper.get('pdf_url', ''):
        print("Object has no PDF URL.")
        return
    path = os.path.join(dirpath, custom_slugify(paper) + '.pdf')
    urlretrieve(paper['pdf_url'], path, reporthook=reporthook)
    return path


def download_from_acl(url, dirpath='./'):
    """Download the PDF for an ACL Anthology page via its BibTeX entry."""
    bib_url = url.strip('\n').rstrip('/') + '.bib'
    bib = urllib.request.urlopen(bib_url).read().decode('utf-8')
    bib_database = database.parse_string(bib, bib_format='bibtex')
    entry = list(bib_database.entries.values())[0]  # .values() is a view in Python 3
    author_lastname = entry.persons['author'][0].last_names[0]
    year = entry.fields['year'].strip()
    title = entry.fields['title'].strip()
    out_name = '[{}+{}] {}.pdf'.format(author_lastname, year, title)
    path = os.path.join(dirpath, out_name)
    pdf_url = entry.fields['url'].strip()  # taken from the entry's "url" field
    logger.info('Download "{}" from "{}"'.format(title, pdf_url))
    urlretrieve(pdf_url, path, reporthook=reporthook)
    return path


@click.command()
@click.argument('urls', type=str, nargs=-1)
@click.option(
    '-o', '--out', default=None, type=click.Path(), help='path to save pdf'
)
def main(urls, out):
    # Fall back to $ARXIV_OUT, then the current directory, when -o is not given.
    if out is None:
        out = os.environ.get('ARXIV_OUT', '.')
    logger.info('Save PDF(s) to {}'.format(out))
    for url in urls:
        if 'arxiv' in url:
            download_from_arxiv(url, dirpath=out)
        elif 'acl' in url:
            download_from_acl(url, dirpath=out)
        else:
            raise NotImplementedError('Unsupported URL: {}'.format(url))
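
# --- Usage sketch (illustrative addition). The wheel's entry_points.txt wires
# `main` up as a console script, but running the module directly should be
# equivalent. The URLs below are examples, not part of the original source:
#   python download_arxiv.py https://arxiv.org/abs/1706.03762
#   python download_arxiv.py https://aclanthology.org/P19-1001/ -o ~/papers
if __name__ == '__main__':
    main()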