#!/usr/bin/env python3
import argparse
import logging
import os
import io
import subprocess
import multiprocessing


class OCR():
	"""Extract the text of a PDF by running an external worker on every page.

	The page count is read from the ``pdfinfo`` command, then the
	``page_worker`` executable is invoked once per page (``page_worker
	<filename> <page>``) and its UTF-8 output is collected in ``self.text``,
	one entry per page. The result is either printed or written to a ``.txt``
	file next to the PDF.
	"""

	# pdfinfo fields reported as "no" / anything else, exposed as booleans.
	_BOOLEAN_FIELDS = ('tagged', 'encrypted', 'optimized')

	def __init__(self, filename, out, multicore, loglevel):
		"""Store the run configuration.

		Args:
			filename: Path of the PDF to process.
			out: ``"stdout"`` to print the text, ``"file"`` to write it to
				``<filename without extension>.txt``.
			multicore: When true, pages are processed in a process pool.
			loglevel: Name of a ``logging`` level (any case), or ``"NONE"``
				to leave logging unconfigured.
		"""
		self.initialised = False

		# Compare case-insensitively: "none" would otherwise be looked up as
		# logging.NONE, which does not exist.
		if loglevel and loglevel.upper() != "NONE":
			logging.basicConfig(filename="pdf_extractor.log", level=getattr(logging, loglevel.upper()))

		self.filename = filename
		self.out = out
		self.multicore = multicore

		# Always define the attribute so it can be inspected in any mode.
		self.output = None
		if self.out == "file":
			self.output = os.path.splitext(filename)[0] + '.txt'

		self.info = {}
		self.text = []

		self.initialised = True

	def start(self):
		"""Run the whole pipeline: read PDF info, process pages, emit output."""
		if not self.initialised:
			logging.error("ERROR: OCR not initialised, cannot run")
			return

		self.load_pdf_info()
		logging.info("Pages: %d", self.info['pages'])

		self.run_ocr()
		self.finalise()

	@staticmethod
	def _parse_pdfinfo(report):
		"""Turn the text printed by ``pdfinfo`` into a dict.

		Keys are the lower-cased field names. Values are stripped strings,
		except for ``pages`` and ``file size`` (ints) and the fields in
		``_BOOLEAN_FIELDS`` (bools). Conversions are applied only to fields
		that are present; lines without a ``:`` separator are ignored.
		"""
		info = {}

		for line in report.splitlines():
			# Split on the FIRST colon only: values such as dates or
			# "yes (print:yes copy:no)" contain colons themselves.
			key, separator, value = line.partition(":")
			if not separator:
				continue
			info[key.strip().lower()] = value.strip()

		for field in OCR._BOOLEAN_FIELDS:
			if field in info:
				info[field] = info[field] != "no"

		if 'pages' in info:
			info['pages'] = int(info['pages'])

		if 'file size' in info:
			# The value looks like "<number> bytes"; keep the number.
			info['file size'] = int(info['file size'].split(' ')[0])

		return info

	def load_pdf_info(self):
		"""Populate ``self.info`` from the output of ``pdfinfo <filename>``.

		Raises:
			subprocess.CalledProcessError: ``pdfinfo`` exited with an error.
			KeyError: The output contains no ``Pages`` field.
		"""
		logging.info("load_pdf '%s'", self.filename)

		raw = subprocess.check_output(["pdfinfo", self.filename])
		# Undecodable metadata bytes must not abort the run.
		self.info.update(self._parse_pdfinfo(raw.decode('utf-8', errors='replace')))

		if 'pages' not in self.info:
			logging.error("pdfinfo reported no page count for '%s'", self.filename)
			raise KeyError('pages')

	def run_ocr(self):
		"""Fill ``self.text`` with the worker output of every page, in order.

		In multicore mode the pages are mapped over a pool with one process
		per CPU. In single-core mode they are processed sequentially; a page
		whose worker exits with an error is logged and recorded as an empty
		string, and a missing or non-executable worker stops the loop.
		"""
		logging.info("run_ocr")
		pages = range(1, self.info['pages'] + 1)

		if self.multicore:
			count = multiprocessing.cpu_count()
			logging.info("Using multicore on %d cores", count)

			values = [{'filename': self.filename, 'page': page} for page in pages]

			with multiprocessing.Pool(processes=count) as pool:
				self.text = pool.map(ocr_sub, values)
			return

		logging.info("Using single core")
		# Start from a clean list, as the multicore branch does.
		self.text = []
		try:
			for page in pages:
				logging.info("Page %d", page)
				try:
					raw = subprocess.check_output(['page_worker', self.filename, str(page)])
				except subprocess.CalledProcessError as e:
					# Same policy as the multicore worker: log the failure
					# and keep an empty entry for the page.
					logging.error(e)
					self.text.append("")
					continue
				self.text.append(raw.decode('utf-8'))

		except FileNotFoundError as e:
			logging.error(e)

		except PermissionError as e:
			print(e)
			print("You may need to chmod +x 'page_worker'")

	def finalise(self):
		"""Emit ``self.text`` to stdout or to the output file, per ``self.out``."""
		logging.info("finalising, text length: %d", len(self.text))
		if self.out == "stdout":
			for line in self.text:
				print(line)

		elif self.out == "file":
			# The text was decoded as UTF-8, so write it back as UTF-8
			# instead of relying on the locale's default encoding.
			with io.open(self.output, 'w', encoding='utf-8') as f:
				f.writelines(self.text)
		else:
			logging.error("Unknown out format %s", self.out)

def ocr_sub(args):
	"""Run ``page_worker`` on one page and return its output as text.

	Args:
		args: Dict with the keys ``'filename'`` (path of the PDF) and
			``'page'`` (integer page number). A single dict is used so the
			function can be passed directly to ``Pool.map``.

	Returns:
		The worker's standard output decoded as UTF-8, or ``""`` when the
		worker is missing, is not executable, or exits with an error.
	"""
	logging.info("ocr_sub '%s':%d", args['filename'], args['page'])
	command = ['page_worker', args['filename'], str(args['page'])]

	try:
		return subprocess.check_output(command).decode('utf-8')

	# A missing worker must not raise out of the pool and abort every other
	# page; treat it like a failed page and report it.
	except (subprocess.CalledProcessError, FileNotFoundError) as e:
		logging.error(e)

	except PermissionError as e:
		print(e)
		print("You may need to chmod +x 'page_worker'")

	return ""


def main(argv=None):
	"""Parse the command line and run the OCR pipeline.

	Args:
		argv: Optional list of argument strings. When omitted, argparse
			reads the arguments from ``sys.argv``.

	An ``out`` value other than ``stdout`` or ``file`` is rejected by
	argparse, which prints a usage error and exits with status 2.
	"""
	parser = argparse.ArgumentParser(description='Run OCR on PDF files.')
	parser.add_argument('pdf', type=str, help='PDF file to process')
	parser.add_argument('out', type=str, choices=['stdout', 'file'], help='stdout|file')
	parser.add_argument('-m', '--multicore', action='store_true', help='process pages in parallel, one worker process per CPU core')
	parser.add_argument("-l", "--loglevel", default="NONE", help="set logging level")
	args = parser.parse_args(argv)

	p = OCR(args.pdf, args.out, args.multicore, args.loglevel)
	p.start()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()
