PKNH^Vdicio/__init__.py""" Python API não oficial para Dicio.com.br @autor Felipe Pontes @email felipemfpontes@gmail.com """ from dicio.dicio import Dicio from dicio.dicio import Word from dicio.utils import Utils PKٌNH bbdicio/utils.py# coding: utf-8 class Utils(object): @staticmethod def remove_tags(str): """ Return a new string without html tags. >>> remove_tags("Something") 'Something' """ import re return re.sub('<[^>]*>', ' ', str).strip() @staticmethod def text_before(str, after): """ Return text before after. >>> text_before("Something", "") "Something" """ index = str.find(after) if index > -1: return str[:index] return str @staticmethod def text_after(str, before): """ Return text after before. >>> text_after("Something", "") 'Something' """ index = str.find(before) if index > -1: index += len(before) return str[index:] return str @staticmethod def text_between(str, before, after, force_html = False): """ Return text between before and after. Use force_html when before and after were html tags. >>> text_between("Something", "", "") 'Something' """ start = str.find(before) if start > -1: start += len(before) if force_html: if before[-1] != ">": start = str.find(">", start) + 1 end = str.find(after, start) if force_html: if after[0] != "<": end = str.find("<", start) if -1 < start < end: return str[start:end] return str @staticmethod def remove_spaces(str): """ Return a new string without double space, tabs, carriage return or line feed. >>> remove_spaces("Something else") 'Something else' """ str = str.replace("\t", " ") str = str.replace("\n", " ") str = str.replace("\r", " ") while str.find(" ") > -1: str = str.replace(" ", " ") return str.strip() @staticmethod def remove_accents(str): """ Return a new string without accents from portuguese >>> remove_accents("trava-língua") 'trava-lingua' """ encode = ["á", "à", "â", "ã", "ä", "é", "è", "ê", "ë", "í", "ì", "î", "ï", "ó", "ò", "ô", "õ", "ö", "ú", "ù", "û", "ü", "ç"] decode = ["a", "a", "a", "a", "a", "e", "e", "e", "e", "i", "i", "i", "i", "o", "o", "o", "o", "o", "u", "u", "u", "u", "c"] out = "" found = False for chr in str: for x, vgl in enumerate(encode): if chr == vgl: out += decode[x] found = True break else: found = False if not found: out += chr return out @staticmethod def split_html_tag(str, tag): """ Return a list like split, but it uses html tags in various formats. >>> str = "Something
else
and
another

thing" >>> split_html_tag(str, "br") ['Something', 'else', 'and', 'another', 'thing'] """ TEMPLATE = "<{0} />" templates = ["<{0}>", "<{0}>", "<{0} >", "<{0} >", "<{0}>", "<{0}/>", "<{0} >"] new_str = str for template in templates: new_str = new_str.replace(template.format(tag), TEMPLATE.format(tag)) return list(filter(None, new_str.split(TEMPLATE.format(tag)))) PKNH=MIo dicio/dicio.py# coding: utf-8 """ Unofficial Python API for Dicio.com.br @author Felipe Pontes @email felipemfpontes@gmail.com """ import html from urllib import request from dicio.utils import Utils BASE_URL = 'http://www.dicio.com.br/{}' CHARSET = 'iso-8859-1' TAG_MEANING = ('id="significado"', '

') TAG_SYNONYMS = ('class="adicional sinonimos"', '

') TAG_SYNONYMS_DELIMITER = ('') TAG_ENCHANT = ('id="enchant"', '') TAG_EXTRA = ('class="adicional"', '

') TAG_EXTRA_SEP = 'br' TAG_EXTRA_DELIMITER = ('', '') class Word(object): def __init__(self, word, meaning=None, synonyms=[], extra={}): self.word = word.strip().lower() self.url = BASE_URL.format(Utils.remove_accents(word).strip().lower()) self.meaning = meaning self.synonyms = synonyms self.extra = extra def __repr__(self): return self.word def load(self): found = Dicio().search(self.word) self.meaning = found.meaning self.synonyms = found.synonyms self.extra = found.extra class Dicio(object): """ Dicio API with meaning, synonyms and extra information. """ def search(self, word): """ Search for word. """ if len(word.split()) > 1: return None _word = Utils.remove_accents(word).strip().lower() try: url = request.urlopen(BASE_URL.format(_word)) except: return None page = html.unescape(url.read().decode(CHARSET)) if page.find(TAG_ENCHANT[0]) > -1: return None found = Word(word) found.meaning = self.meaning(page) found.synonyms = self.synonyms(page) found.extra = self.extra(page) return found def meaning(self, page): """ Return meaning. """ return Utils.remove_spaces(Utils.remove_tags(Utils.text_between(page, TAG_MEANING[0], TAG_MEANING[1], True))) def synonyms(self, page): """ Return list of synonyms. """ synonyms = [] if page.find(TAG_SYNONYMS[0]) > -1: synonyms_html = Utils.text_between(page, TAG_SYNONYMS[0], TAG_SYNONYMS[1], True) while synonyms_html.find(TAG_SYNONYMS_DELIMITER[0]) > -1: synonym = Utils.text_between(synonyms_html, TAG_SYNONYMS_DELIMITER[0], TAG_SYNONYMS_DELIMITER[1], True) synonyms.append(Word(Utils.remove_spaces(synonym))) synonyms_html = synonyms_html.replace(TAG_SYNONYMS_DELIMITER[0], "", 1) synonyms_html = synonyms_html.replace(TAG_SYNONYMS_DELIMITER[1], "", 1) return synonyms def extra(self, page): """ Return a dictionary of extra information. """ dic_extra = {} try: if page.find(TAG_EXTRA[0]) > -1: extra_html = Utils.text_between(page, TAG_EXTRA[0], TAG_EXTRA[1], True) extra_rows = Utils.split_html_tag(Utils.remove_spaces(extra_html), TAG_EXTRA_SEP) for row in extra_rows: _row = Utils.remove_tags(row) key, value = _row.split(":") dic_extra[Utils.remove_spaces(key)] = Utils.remove_spaces(value) except: pass return dic_extra PKRLHtests/__init__.pyPKNHtY Y tests/tests.py# coding: utf-8 import unittest from dicio import Dicio from dicio import Word from dicio import Utils class TestUtils(unittest.TestCase): s = 'Something' def test_remove_tags(self): expected = 'Something' result = Utils.remove_tags(self.s) self.assertEqual(expected, result) def test_text_before(self): expected = 'Something' result = Utils.text_before(self.s, '') self.assertEqual(expected, result) def test_text_after(self): expected = 'Something' result = Utils.text_after(self.s, '') self.assertEqual(expected, result) def test_text_between(self): expected = 'Something' result = Utils.text_between(self.s, '', '') result_force_html = Utils.text_between(self.s, '