PK!Ypycnpj_crawler/__init__.py__version__ = '0.3.0' PK! ]pycnpj_crawler/crawler.pyfrom pycnpj_crawler.states.util import load_state_class from pycpfcnpj import cpfcnpj def get_cnpj_data(cnpj, state="ba"): """ Busca no site do estado os dados do CNPJ provido Args: cnpj (str): CNPJ, somente numeros, da empresa alvo state (str): Sigla do estado, em letras minusculas Returns: cnpj (obj) """ valid = cpfcnpj.validate(cnpj) if (not valid): raise Exception("CNPJ inválido") state = load_state_class(state) return state().get_cnpj_data(cnpj) PK!!pycnpj_crawler/states/__init__.pyPK!GB pycnpj_crawler/states/ba.pyfrom requests_html import HTMLSession from .crawling import wait_random_delay, get_random_user_agent import unidecode class Bahia: URL_BASE = "http://www.sefaz.ba.gov.br/scripts/cadastro/cadastroBa/consultaBa.asp" POST_URL = "http://www.sefaz.ba.gov.br/scripts/cadastro/cadastroBa/result.asp" selectors = { "cnpj": "#Table5 > tr > td > p:nth-child(1) > table > tr:nth-child(3) > td:nth-child(1)", "incricao_estadual": "#Table5 > tr > td > p:nth-child(1) > table > tr:nth-child(3) > td:nth-child(2)", "razao_social": "#Table5 > tr > td > p:nth-child(1) > table > tr:nth-child(4) > td:nth-child(1)", "nome_fantasia": "#Table5 > tr > td > p:nth-child(1) > table > tr:nth-child(5) > td:nth-child(1)" } def _get_cnpj_raw_data(self, cnpj): session = HTMLSession() session.get(self.URL_BASE) payload = { "sefp": 1, "estado": "BA", "CGC": cnpj, "B1": "CNPJ++-%3E", "IE": "" } wait_random_delay() return session.post( self.POST_URL, data=payload, headers={ 'User-Agent': get_random_user_agent() }) def get_cnpj_data(self, cnpj): html = self._get_cnpj_raw_data(cnpj).html def get_value(raw_value): no_special_char = raw_value.text.replace("\xa0", " ") value = no_special_char.split(":")[1].strip() return value def get_key_value_pair(raw_value): no_special_char = raw_value.replace("\xa0", " ") key_value = no_special_char.split(":") return (turn_to_key(key_value[0].strip()), key_value[1].strip()) def turn_to_key(field_name): lower = field_name.lower() lower = lower.replace(" ", "_") lower = lower.replace("/", "_") lower = lower.replace("-", "") lower = unidecode.unidecode(lower) return lower def get_company_data_section(): obj = dict() tds = html.find("#Table6")[0].text.split("\n")[2:] for td in tds: k, v = get_key_value_pair(td) obj[k] = v return obj def get_address_data_section(): obj = dict() tds = html.find("#Table6")[1].text.split("\n")[2:] for td in tds: k, v = get_key_value_pair(td) obj[k] = v return obj def get_activities_data_section(): atividades = html.find("#Table7")[0].text.split("\n")[2:] atividade_principal = atividades[1].split("-") atividade = { "principal": { "id": atividade_principal[0], "descricao": atividade_principal[1] } } return atividade try: result = { **get_company_data_section(), "endereco": get_address_data_section(), "atividades": get_activities_data_section() } return result except Exception: raise Exception("CNPJ não encontrado") PK!Y((!pycnpj_crawler/states/crawling.pyimport random import time user_agent_list = [ #Chrome 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', #Firefox 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)', 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)', #Android Mobile User Agents 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36', 'Mozilla/5.0 (Linux; Android 6.0; HTC One X10 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/61.0.3163.98 Mobile Safari/537.36', #iPhone User Agents 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/13.2b11866 Mobile/16A366 Safari/605.1.15', 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1', 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1', 'Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1', 'Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3' ] def get_random_user_agent(): return random.choice(user_agent_list) def wait_random_delay(): time.sleep(random.randrange(2, 10)) PK! **pycnpj_crawler/states/util.pyBA = "ba" _states_mapping = {} _states_mapping[BA] = "Bahia" def get_state_name(state): return _states_mapping[state] def load_state_class(state): module = __import__(f"{state}", globals(), locals(), [], 1) state_name = get_state_name(state) return getattr(module, state_name) PK!HڽTU$pycnpj_crawler-0.1.0.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!H n'pycnpj_crawler-0.1.0.dist-info/METADATA_K0)ЄtS|Pϗ4It*OIN~b"A;+錗S>z<'c{g }P?dCMs^-Sk 34{Iᦝ4!OvcsrEuc{A4fwcmlr&*~n|Nxle8LЋղE57C \ࢨY'T8~PK!Hn %pycnpj_crawler-0.1.0.dist-info/RECORDIsP}~ |" ȠAP/;eLSY:ιx q  vBrVfEhw6 j ul=S'oSWyBCi$4wdH6u'X |6ja 6NxMGdd v6tҜ}b۾{=+b#=D]uZo`0-AͿye'B-Ecܞ*չPhF fI~ej*D;+{  nיx"rqw]\ ɒ %rA-.iITGMU7vUgRZ6"%!I+0FoUWTďqw* 7'Kj)CsUcfT&tޝ6wTrxPK!Ypycnpj_crawler/__init__.pyPK! ]Npycnpj_crawler/crawler.pyPK!!pycnpj_crawler/states/__init__.pyPK!GB pycnpj_crawler/states/ba.pyPK!Y((!pycnpj_crawler/states/crawling.pyPK! ** pycnpj_crawler/states/util.pyPK!HڽTU$p!pycnpj_crawler-0.1.0.dist-info/WHEELPK!H n'"pycnpj_crawler-0.1.0.dist-info/METADATAPK!Hn %D#pycnpj_crawler-0.1.0.dist-info/RECORDPK 9%