# textacy/__init__.py
from __future__ import absolute_import

import logging

from textacy.preprocess import preprocess_text
from textacy.texts import TextDoc, TextCorpus
from textacy import data

__all__ = [
    'preprocess_text',
    'TextDoc',
    'TextCorpus'
]

__version__ = '0.1.1'

logger = logging.getLogger('textacy')
if len(logger.handlers) == 0:  # To ensure reload() doesn't add another one
    logger.addHandler(logging.NullHandler())


# textacy/data.py
"""
Functions to load and cache language data and other NLP resources.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import json
import os

import pyphen
from cachetools import cached, Cache, hashkey
from functools import partial
from pandas import read_csv

logger = logging.getLogger(__name__)

DEFAULT_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources')

_CACHE = {}
"""dict: key-value store used to cache datasets and such in memory"""


# TODO: maybe don't actually cache this -- it takes up a lot of RAM
# but is indeed a pain to load
@cached(Cache(1), key=partial(hashkey, 'spacy_pipeline'))
def load_spacy_pipeline(lang='en', **kwargs):
    """
    Load a language-specific pipeline (collection of data, models, and resources)
    via Spacy for tokenizing, tagging, parsing, etc. raw text.

    Args:
        lang (str {'en'}, optional): standard 2-letter language abbreviation
        **kwargs: keyword arguments to pass to Spacy pipeline instantiation;
            see Spacy's documentation

    Returns:
        :class:`spacy.<lang>.<Language>`

    Raises:
        ValueError: if `lang` not equal to 'en' (more languages coming?!?)
    """
    logger.info('Loading "%s" language Spacy pipeline', lang)
    if lang == 'en':
        from spacy.en import English
        return English(**kwargs)
    # TODO: uncomment these whenever spacy makes them available...
    # elif lang == 'de':
    #     from spacy.de import German
    #     return German(**kwargs)
    # elif lang == 'it':
    #     from spacy.it import Italian
    #     return Italian(**kwargs)
    # elif lang == 'fi':
    #     from spacy.fi import Finnish
    #     return Finnish(**kwargs)
    else:
        msg = 'spacy does not currently support lang "{}"'.format(lang)
        raise ValueError(msg)


@cached(_CACHE, key=partial(hashkey, 'hyphenator'))
def load_hyphenator(lang='en'):
    """
    Load an object that hyphenates words at valid points, as used in LaTeX
    typesetting. Note that while hyphenation points always fall on syllable
    divisions, not all syllable divisions are valid hyphenation points. But
    it's decent.

    Args:
        lang (str, optional): standard 2-letter language abbreviation; to get
            a list of valid values::

                >>> import pyphen; pyphen.LANGUAGES

    Returns:
        :class:`pyphen.Pyphen()`
    """
    logger.info('Loading "%s" language hyphenator', lang)
    return pyphen.Pyphen(lang=lang)


@cached(_CACHE, key=partial(hashkey, 'depechemood'))
def load_depechemood(data_dir=None, weighting='normfreq'):
    """
    Load the DepecheMood lexicon text file from disk, and munge it into a nested
    dictionary for convenient lookup by lemma#POS. NB: English only!

    Each version of DepecheMood is built starting from word-by-document matrices,
    using either raw frequencies (DepecheMood_freq.txt), normalized frequencies
    (DepecheMood_normfreq.txt), or tf-idf (DepecheMood_tfidf.txt). The files are
    tab-separated; each row contains one Lemma#PoS followed by the scores for the
    following emotions: AFRAID, AMUSED, ANGRY, ANNOYED, DONT_CARE, HAPPY,
    INSPIRED, SAD.

    Args:
        data_dir (str, optional): directory on disk where DepecheMood lexicon
            text files are stored
        weighting (str {'freq', 'normfreq', 'tfidf'}, optional): type of word
            weighting used in building DepecheMood matrix

    Returns:
        dict[dict]: top-level keys are Lemma#POS strings, values are nested dicts
            with emotion names as keys and weights as floats

    References:
        Staiano, J., & Guerini, M. (2014). "DepecheMood: a Lexicon for Emotion
        Analysis from Crowd-Annotated News". Proceedings of ACL-2014. (arXiv:1405.1605)

        Data available at https://github.com/marcoguerini/DepecheMood/releases .
    """
    if data_dir is None:
        data_dir = os.path.join(DEFAULT_DATA_DIR, 'DepecheMood_V1.0')
    fname = os.path.join(data_dir, 'DepecheMood_' + weighting + '.txt')
    # let's make sure this file exists...
    if not os.path.isfile(fname):
        raise IOError('DepecheMood lexicon file not found at {}'.format(fname))
    logger.info('Loading DepecheMood lexicon from %s', fname)
    return json.loads(read_csv(
        fname, sep='\t', index_col='Lemma#PoS').to_json(orient='index'))


# textacy/extract.py
# -*- coding: utf-8 -*-
"""
Functions to extract various elements of interest from documents already parsed
by spacy (http://spacy.io/), such as n-grams, named entities, subject-verb-object
triples, and acronyms.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import re
from collections import defaultdict
from cytoolz import itertoolz
from itertools import takewhile
from numpy import nanmin, nanmax, zeros, NaN
from operator import itemgetter
from spacy.parts_of_speech import CONJ, DET, NOUN, VERB

from textacy import spacy_utils, text_utils
from textacy.spacy_utils import (normalized_str, get_main_verbs_of_sent,
                                 get_subjects_of_verb, get_objects_of_verb,
                                 get_span_for_compound_noun,
                                 get_span_for_verb_auxiliaries)
from textacy.regexes_etc import NUMERIC_NE_TYPES, REPORTING_VERBS


def words(doc, filter_stops=True, filter_punct=True, filter_nums=False,
          good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered list of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    Args:
        doc (:class:`spacy.Doc()` or :class:`spacy.Span()`)
        filter_stops (bool, optional): if True, remove stop words from word list
        filter_punct (bool, optional): if True, remove punctuation from word list
        filter_nums (bool, optional): if True, remove number-like words (e.g.
10, 'ten') from word list good_pos_tags (set[str], optional): remove words whose part-of-speech tag is NOT in the specified tags, using the set of universal POS tagset bad_pos_tags (set[str], optional): remove words whose part-of-speech tag IS in the specified tags, using the set of universal POS tagset min_freq (int, optional): remove words that occur in `doc` fewer than `min_freq` times Returns: list[:class:`spacy.Token()`] """ words_ = (w for w in doc) if filter_stops is True: words_ = (w for w in words_ if not w.is_stop) if filter_punct is True: words_ = (w for w in words_ if not w.is_punct) if filter_nums is True: words_ = (w for w in words_ if not w.like_num) if good_pos_tags: words_ = (w for w in words_ if w.pos_ in good_pos_tags) if bad_pos_tags: words_ = (w for w in words_ if w.pos_ not in bad_pos_tags) if min_freq > 1: words_ = list(words_) freqs = itertoolz.frequencies(normalized_str(w) for w in words_) words_ = (w for w in words_ if freqs[normalized_str(w)] >= min_freq) return list(words_) def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags=None, bad_pos_tags=None, min_freq=1): """ Extract an ordered list of n-grams (`n` consecutive words) from a spacy-parsed doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: doc (:class:`spacy.Doc()` or :class:`spacy.Span()`) n (int): number of tokens per n-gram; 2 gives bigrams, 3 gives trigrams, etc. filter_stops (bool, optional): if True, remove ngrams that start or end with a stop word filter_punct (bool, optional): if True, remove ngrams that contain any punctuation-only tokens filter_nums (bool, optional): if True, remove ngrams that contain any numbers or number-like tokens (e.g. 10, 'ten') good_pos_tags (set[str], optional): remove ngrams whose constituent tokens' part-of-speech tags are NOT all in the specified tags, using the universal POS tagset bad_pos_tags (set[str], optional): remove ngrams if any of their constituent tokens' part-of-speech tags are in the specified tags, using the universal POS tagset min_freq (int, optional): remove ngrams that occur in `doc` fewer than `min_freq` times Returns: list[:class:`spacy.Span()`] Raises: ValueError: if `n` < 1 """ if n < 1: raise ValueError('`n` must be greater than or equal to 1.') ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1)) if filter_stops is True: ngrams_ = (ngram for ngram in ngrams_ if not ngram[0].is_stop and not ngram[-1].is_stop) if filter_punct is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_punct for w in ngram)) if filter_nums is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.like_num for w in ngram)) if good_pos_tags: ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ in good_pos_tags for w in ngram)) if bad_pos_tags: ngrams_ = (ngram for ngram in ngrams_ if not any(w.pos_ in bad_pos_tags for w in ngram)) if min_freq > 1: ngrams_ = list(ngrams_) freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_) ngrams_ = (ngram for ngram in ngrams_ if freqs[normalized_str(ngram)] >= min_freq) return list(ngrams_) def named_entities(doc, good_ne_types=None, bad_ne_types=None, min_freq=1, drop_determiners=True): """ Extract an ordered list of named entities (PERSON, ORG, LOC, etc.) from a spacy-parsed doc, optionally filtering by the entity types and frequencies. 
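# --- Illustrative usage sketch (not part of the original source) ---------------
# A minimal example of the `words()` and `ngrams()` extractors defined above;
# the sample sentence and variable names are assumptions.
from textacy import data, extract

nlp = data.load_spacy_pipeline(lang='en')
doc = nlp(u'The quick brown fox jumps over the lazy dog.')

# unigrams, keeping only nouns and adjectives
nouns_adjs = extract.words(doc, filter_stops=True, filter_punct=True,
                           good_pos_tags={'NOUN', 'ADJ'})
# bigrams that neither start nor end with a stop word
bigrams = extract.ngrams(doc, 2, filter_stops=True, filter_punct=True)
print([w.text for w in nouns_adjs], [ng.text for ng in bigrams])
# --------------------------------------------------------------------------------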
Args: doc (:class:`spacy.Doc()`) good_ne_types (set[str] or 'numeric', optional): named entity types to include; if "numeric", all numeric entity types are included bad_ne_types (set[str] or 'numeric', optional): named entity types to exclude; if "numeric", all numeric entity types are excluded min_freq (int, optional): remove named entities that occur in `doc` fewer than `min_freq` times drop_determiners (bool, optional): remove leading determiners (e.g. "the") from named entities (e.g. "the United States" => "United States") Returns: list[:class:`spacy.Span()`] """ nes = doc.ents if good_ne_types: if good_ne_types == 'numeric': good_ne_types = NUMERIC_NE_TYPES nes = (ne for ne in nes if ne.label_ in good_ne_types) if bad_ne_types: if bad_ne_types == 'numeric': bad_ne_types = NUMERIC_NE_TYPES nes = (ne for ne in nes if ne.label_ not in bad_ne_types) if drop_determiners is True: nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes) if min_freq > 1: nes = list(nes) freqs = itertoolz.frequencies(ne.text for ne in nes) nes = (ne for ne in nes if freqs[ne.text] >= min_freq) return list(nes) def noun_phrases(doc, drop_determiners=True, min_freq=1): """ Extract an ordered list of noun phrases from a spacy-parsed doc, optionally filtering by frequency and dropping leading determiners. Args: doc (:class:`spacy.Doc()`) drop_determiners (bool, optional): remove leading determiners (e.g. "the") from phrases (e.g. "the quick brown fox" => "quick brown fox") min_freq (int, optional): remove NPs that occur in `doc` fewer than `min_freq` times Returns: list[:class:`spacy.Span()`] """ nps = doc.noun_chunks if drop_determiners is True: nps = (np if np[0].pos != DET else np[1:] for np in nps) if min_freq > 1: nps = list(nps) freqs = itertoolz.frequencies(normalized_str(np) for np in nps) nps = (np for np in nps if freqs[normalized_str(np)] >= min_freq) return list(nps) def pos_regex_matches(doc, pattern): """ Extract sequences of consecutive tokens from a spacy-parsed doc whose part-of-speech tags match the specified regex pattern. Args: doc (:class:`spacy.Doc()` or :class:`spacy.Span()`) pattern (str): Pattern of consecutive POS tags whose corresponding words are to be extracted, inspired by the regex patterns used in NLTK's `nltk.chunk.regexp`. Tags are uppercase, from the universal tag set; delimited by < and >, which are basically converted to parentheses with spaces as needed to correctly extract matching word sequences; white space in the input doesn't matter. Examples (see `regexes_etc.POS_REGEX_PATTERNS`): * noun phrase: r'? (+ )* +' * compound nouns: r'+' * verb phrase: r'?*+' * prepositional phrase: r' ? (+)* +' Returns: list[:class:`spacy.Span()`] """ # standardize and transform the regular expression pattern... pattern = re.sub(r'\s', '', pattern) pattern = re.sub(r'<([A-Z]+)\|([A-Z]+)>', r'( (\1|\2))', pattern) pattern = re.sub(r'<([A-Z]+)>', r'( \1)', pattern) tags = ' ' + ' '.join(tok.pos_ for tok in doc) return [doc[tags[0:m.start()].count(' '):tags[0:m.end()].count(' ')] for m in re.finditer(pattern, tags)] def subject_verb_object_triples(doc): """ Extract an ordered list of subject-verb-object (SVO) triples from a spacy-parsed doc. Note that this only works for SVO languages. Args: doc (:class:`spacy.Doc()` or :class:`spacy.Span()`): either a spacy document or a sentence thereof Returns: list[(`spacy.Span()`,`spacy.Span()`,`spacy.Span()`)]: where each element is a (subject, verb, object) 3-tuple # TODO: What to do about questions, where it may be VSO instead of SVO? 
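# --- Illustrative usage sketch (not part of the original source) ---------------
# Sketch of the entity / noun-phrase / POS-pattern extractors defined above. The
# noun-phrase pattern below is an assumption written in the <TAG> syntax that
# `pos_regex_matches()` parses (uppercase universal POS tags in angle brackets);
# it is not the exact pattern from `regexes_etc.POS_REGEX_PATTERNS`.
from textacy import data, extract

nlp = data.load_spacy_pipeline(lang='en')
doc = nlp(u'President Obama spoke in the United States about the new climate deal.')

persons = extract.named_entities(doc, good_ne_types={'PERSON'}, drop_determiners=True)
nps = extract.noun_phrases(doc, drop_determiners=True)
simple_nps = extract.pos_regex_matches(doc, r'<DET>? <ADJ>* <NOUN>+')
# --------------------------------------------------------------------------------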
# TODO: What about non-adjacent verb negations? # TODO: What about object (noun) negations? """ try: sents = doc.sents except AttributeError: sents = [doc] svos = [] for sent in sents: start_i = sent[0].i verbs = get_main_verbs_of_sent(sent) for verb in verbs: subjs = get_subjects_of_verb(verb) if not subjs: continue objs = get_objects_of_verb(verb) if not objs: continue # add adjacent auxiliaries to verbs, for context # and add compounds to compound nouns verb_span = get_span_for_verb_auxiliaries(verb) verb = sent[verb_span[0] - start_i: verb_span[1] - start_i + 1] for subj in subjs: subj = sent[get_span_for_compound_noun(subj)[0] - start_i: subj.i - start_i + 1] for obj in objs: if obj.pos == NOUN: span = get_span_for_compound_noun(obj) elif obj.pos == VERB: span = get_span_for_verb_auxiliaries(obj) else: span = (obj.i, obj.i) obj = sent[span[0] - start_i: span[1] - start_i + 1] svos.append((subj, verb, obj)) return svos def acronyms_and_definitions(doc, known_acro_defs=None): """ Extract a collection of acronyms and their most likely definitions, if available, from a spacy-parsed doc. If multiple definitions are found for a given acronym, only the most frequently occurring definition is returned. Args: doc (:class:`spacy.Doc()` or :class:`spacy.Span()`) known_acro_defs (dict, optional): if certain acronym/definition pairs are known, pass them in as {acronym (str): definition (str)}; algorithm will not attempt to find new definitions Returns: dict: unique acronyms (keys) with matched definitions (values) References: Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions." International Journal on Document Analysis and Recognition 1.4 (1999): 191-198. """ # process function arguments acro_defs = defaultdict(list) if not known_acro_defs: known_acronyms = set() else: for acro, defs in known_acro_defs.items(): if not isinstance(defs, list): acro_defs[acro] = [defs] known_acronyms = set(acro_defs.keys()) try: sents = doc.sents except AttributeError: sents = [doc] # iterate over sentences and their tokens for sent in sents: max_ind = len(sent) - 1 for i, token in enumerate(sent): token_ = token.text if token_ in known_acronyms or text_utils.is_acronym(token_) is False: continue # define definition search window(s) window_size = min(2 * len(token_), len(token_) + 5) windows = [sent[max(i - window_size, 0): i], sent[min(i + 1, max_ind): min(i + window_size + 1, max_ind)]] # if candidate inside (X) or -X-, only look in pre-window if 0 < i < max_ind: adjacent_tokens = sent[i - 1].text + sent[i + 1].text if adjacent_tokens in {'()', '--', '––'}: _ = windows.pop() # iterate over possible windows # filtering for valid definition strings for window in windows: window_ = window.text # window text can't be all uppercase if window_.isupper(): continue # window can't contain separating punctuation if '!' in window_ or '?' in window_ or ':' in window_ or ';' in window_: continue # acronym definition can't contain itself: no ouroboros! 
if token_ in window_: continue # window must contain at least one character used in acronym if not any(char in window_ for char in token_): continue definition, confidence = _get_acronym_definition( token_, window, threshold=0.8) if definition: acro_defs[token_].append((definition, confidence)) if not acro_defs.get(token_): acro_defs[token_].append(('', 0.0)) # vote by confidence score in the case of multiple definitions for acro, defs in acro_defs.items(): if len(defs) == 1: acro_defs[acro] = defs[0][0] else: acro_defs[acro] = sorted(defs, key=itemgetter(1), reverse=True)[0][0] return dict(acro_defs) def _get_acronym_definition(acronym, window, threshold=0.8): """ Identify most likely definition for an acronym given a list of tokens. Args: acronym (str): acronym for which definition is sought window (:class:`spacy.Span()`): a span of tokens from which definition extraction will be attempted threshold (float, optional): minimum "confidence" in definition required for acceptance; valid values in [0.0, 1.0]; higher value => stricter threshold Returns: (str, float): most likely definition for given acronym ('' if none found), along with the confidence assigned to it References: Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions." International Journal on Document Analysis and Recognition 1.4 (1999): 191-198. """ def build_lcs_matrix(X, Y): m = len(X) n = len(Y) b = zeros((m, n), dtype=int) c = zeros((m, n), dtype=int) for i in range(0, m): for j in range(0, n): if X[i] == Y[j]: c[i, j] = c[i - 1, j - 1] + 1 b[i, j] = 1 elif c[i - 1, j] >= c[i, j - 1]: c[i, j] = c[i - 1, j] else: c[i, j] = c[i, j - 1] return c, b def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors): m = b.shape[0] n = b.shape[1] for i in range(start_i, m): for j in range(start_j, n): if b[i, j] == 1: s = (i, j) stack.append(s) if lcs_length == 1: vec = [NaN] * n for k, l in stack: vec[l] = k vectors.append(vec) else: parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1, stack, vectors) stack = [] return vectors def vector_values(v, types): vv = {} first = v.index(int(nanmin(v))) last = v.index(int(nanmax(v))) vv['size'] = (last - first) + 1 vv['distance'] = len(v) - last vv['stop_count'] = 0 vv['misses'] = 0 for i in range(first, last + 1): if v[i] >= 0 and types[i] == 's': vv['stop_count'] += 1 elif v[i] is None and types[i] not in ['s', 'h']: vv['misses'] += 1 return vv def compare_vectors(A, B, types): vv_A = vector_values(A, types) vv_B = vector_values(B, types) # no one-letter matches, sorryboutit if vv_A['size'] == 1: return B elif vv_B['size'] == 1: return A if vv_A['misses'] > vv_B['misses']: return B elif vv_A['misses'] < vv_B['misses']: return A if vv_A['stop_count'] > vv_B['stop_count']: return B if vv_A['stop_count'] < vv_B['stop_count']: return A if vv_A['distance'] > vv_B['distance']: return B elif vv_A['distance'] < vv_B['distance']: return A if vv_A['size'] > vv_B['size']: return B elif vv_A['size'] < vv_B['size']: return A return A # get definition window's leading characters and word types def_leads = [] def_types = [] for tok in window: tok_text = tok.text if tok.is_stop: def_leads.append(tok_text[0]) def_types.append('s') elif text_utils.is_acronym(tok_text): def_leads.append(tok_text[0]) def_types.append('a') elif '-' in tok_text and not tok_text.startswith('-'): tok_split = [t[0] for t in tok_text.split('-') if t] def_leads.extend(tok_split) def_types.extend('H' if i == 0 else 'h' for i in range(len(tok_split))) else: def_leads.append(tok_text[0]) 
def_types.append('w') def_leads = ''.join(def_leads).lower() def_types = ''.join(def_types) # extract alphanumeric characters from acronym acr_leads = ''.join(c for c in acronym if c.isalnum()) # handle special cases of '&' and trailing 's' acr_leads = acr_leads.replace('&', 'a') if acr_leads.endswith('s'): # bail out if it's only a 2-letter acronym to start with, e.g. 'Is' if len(acr_leads) == 2: return ('', 0) acr_leads = acr_leads[:-1] acr_leads = acr_leads.lower() c, b = build_lcs_matrix(acr_leads, def_leads) # 4.4.1 lcs_length = c[c.shape[0] - 1, c.shape[1] - 1] confidence = lcs_length / len(acronym) if confidence < threshold: return ('', confidence) vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], []) # first letter of acronym must be present vecs = [vec for vec in vecs if 0 in vec] if not vecs: return ('', confidence) best_vec = vecs[0] for vec in vecs[1:]: best_vec = compare_vectors(best_vec, vec, def_types) first = best_vec.index(int(nanmin(best_vec))) last = best_vec.index(int(nanmax(best_vec))) definition = window[first: last + 1].text if len(definition.split()) == 1: return ('', confidence) return (definition, confidence) def semistructured_statements(doc, entity, cue='be', ignore_entity_case=True, min_n_words=1, max_n_words=20): """ Extract "semi-structured statements" from a spacy-parsed doc, each as a (entity, cue, fragment) triple. This is similar to subject-verb-object triples. Args: doc (:class:`spacy.Doc()`) entity (str): a noun or noun phrase of some sort (e.g. "President Obama", "global warming", "Python") cue (str, optional): verb lemma with which `entity` is associated (e.g. "talk about", "have", "write") ignore_entity_case (bool, optional): if True, entity matching is case-independent min_n_words (int, optional): min number of tokens allowed in a matching fragment max_n_words (int, optional): max number of tokens allowed in a matching fragment Returns: list[(:class:`spacy.Span` or :class:`spacy.Token`, :class:`spacy.Span` or :class:`spacy.Token`, :class:`spacy.Span`)]: where each element is a (element, cue, fragment) 3-tuple Notes: Inspired by N. Diakopoulos, A. Zhang, A. Salway. Visual Analytics of Media Frames in Online News and Blogs. IEEE InfoVis Workshop on Text Visualization. October, 2013. Which itself was inspired by by Salway, A.; Kelly, L.; Skadiņa, I.; and Jones, G. 2010. Portable Extraction of Partially Structured Facts from the Web. In Proc. ICETAL 2010, LNAI 6233, 345-356. Heidelberg, Springer. 
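# --- Illustrative usage sketch (not part of the original source) ---------------
# Sketch of `acronyms_and_definitions()` defined above; the sample text is an
# assumption. The result maps each detected acronym to its most confident
# definition (an empty string if none is found).
from textacy import data, extract

nlp = data.load_spacy_pipeline(lang='en')
doc = nlp(u'The National Aeronautics and Space Administration (NASA) was founded in 1958.')

acros = extract.acronyms_and_definitions(doc)
# --------------------------------------------------------------------------------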
""" if ignore_entity_case is True: entity_toks = entity.lower().split(' ') get_tok_text = lambda x: x.lower_ else: entity_toks = entity.split(' ') get_tok_text = lambda x: x.text first_entity_tok = entity_toks[0] n_entity_toks = len(entity_toks) cue = cue.lower() cue_toks = cue.split(' ') n_cue_toks = len(cue_toks) def is_good_last_tok(tok): if tok.is_punct: return False if tok.pos in {CONJ, DET}: return False return True statements = [] for sent in doc.sents: for tok in sent: # filter by entity if get_tok_text(tok) != first_entity_tok: continue if n_entity_toks == 1: the_entity = tok the_entity_root = the_entity if tok.i + n_cue_toks >= len(doc): continue elif all(get_tok_text(tok.nbor(i=i + 1)) == et for i, et in enumerate(entity_toks[1:])): the_entity = doc[tok.i: tok.i + n_entity_toks] the_entity_root = the_entity.root else: continue # filter by cue terh = the_entity_root.head if terh.lemma_ != cue_toks[0]: continue if n_cue_toks == 1: min_cue_i = terh.i max_cue_i = terh.i + n_cue_toks the_cue = terh elif all(terh.nbor(i=i + 1).lemma_ == ct for i, ct in enumerate(cue_toks[1:])): min_cue_i = terh.i max_cue_i = terh.i + n_cue_toks the_cue = doc[terh.i: max_cue_i] else: continue if the_entity_root in the_cue.rights: continue # now add adjacent auxiliary and negating tokens to the cue, for context try: min_cue_i = min(left.i for left in takewhile( lambda x: x.dep_ in {'aux', 'neg'}, reversed(list(the_cue.lefts)))) except ValueError: pass try: max_cue_i = max(right.i for right in takewhile( lambda x: x.dep_ in {'aux', 'neg'}, the_cue.rights)) except ValueError: pass if max_cue_i - min_cue_i > 1: the_cue = doc[min_cue_i: max_cue_i] else: the_cue = doc[min_cue_i] # filter by fragment try: min_frag_i = min(right.left_edge.i for right in the_cue.rights) max_frag_i = max(right.right_edge.i for right in the_cue.rights) except ValueError: continue while is_good_last_tok(doc[max_frag_i]) is False: max_frag_i -= 1 n_fragment_toks = max_frag_i - min_frag_i if n_fragment_toks <= 0 or n_fragment_toks < min_n_words or n_fragment_toks > max_n_words: continue # HACK... if min_frag_i == max_cue_i - 1: min_frag_i += 1 the_fragment = doc[min_frag_i: max_frag_i + 1] statements.append((the_entity, the_cue, the_fragment)) return statements def direct_quotations(doc): """ Baseline, not-great attempt at direction quotation extraction (no indirect or mixed quotations) using rules and patterns. English only. Args: doc (:class:`spacy.Doc()`) Returns: list[(:class:`spacy.Span`, :class:`spacy.Token`, :class:`spacy.Span`)]: where each element is a (speaker, reporting verb, quotation) 3-tuple Notes: Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic Tagging of Reported Speech in Newspaper Articles". TODO: Better approach would use ML, but needs a training dataset. TODO: why are "''" pairs giving problems -- is it just Friedman?! 
""" quotations = [] quote_end_punct = {',', '.', '?', '!'} quote_indexes = set(itertoolz.concat( (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string))) quote_positions = list(itertoolz.partition( 2, sorted(tok.i for tok in doc if tok.idx in quote_indexes))) sents = list(doc.sents) sent_positions = [(sent.start, sent.end) for sent in sents] for q0, q1 in quote_positions: quote = doc[q0: q1 + 1] # we're only looking for direct quotes, not indirect or mixed if not any(char in quote_end_punct for char in quote.text[-4:]): continue # get adjacent sentences candidate_sent_indexes = [] for i, (s0, s1) in enumerate(sent_positions): if s0 <= q1 + 1 and s1 > q1: candidate_sent_indexes.append(i) elif s0 < q0 and s1 >= q0 - 1: candidate_sent_indexes.append(i) for si in candidate_sent_indexes: sent = sents[si] # get any reporting verbs rvs = [tok for tok in sent if spacy_utils.preserve_case(tok) is False and tok.lemma_ in REPORTING_VERBS and tok.pos_ == 'VERB' and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)] # get target offset against which to measure distances of NEs if rvs: if len(rvs) == 1: rv = rvs[0] else: min_rv_dist = 1000 for rv_candidate in rvs: rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1)) if rv_dist < min_rv_dist: rv = rv_candidate min_rv_dist = rv_dist else: break else: # TODO: do we have no other recourse?! continue try: # rv_subj = _find_subjects(rv)[0] rv_subj = get_subjects_of_verb(rv)[0] except IndexError: continue # if rv_subj.text in {'he', 'she'}: # for ne in named_entities(doc, good_ne_types={'PERSON'}): # if ne.start < rv_subj.i: # speaker = ne # else: # break # else: span = get_span_for_compound_noun(rv_subj) speaker = doc[span[0]: span[1] + 1] quotations.append((speaker, rv, quote)) break return quotations PK 1 have df assumed = 1); NOTE: results are better with idf information Returns: list[(str, float)]: sorted list of top `n_keyterms` key terms and their corresponding SGRank scores Raises: ValueError: if `n_keyterms` is a float but not in (0.0, 1.0] References: Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117. 
""" if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_toks = len(doc) min_term_freq = min(n_toks // 1500, 4) # build full list of candidate terms terms = list(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq) for n in range(1, 7))) # if inverse document frequencies available, also add verbs # verbs without IDF downweighting dominate the results, and not in a good way if idf: terms.extend(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'VERB'}, min_freq=min_term_freq) for n in range(1, 7))) terms_as_strs = {id(term): spacy_utils.normalized_str(term) for term in terms} # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available n_top_20pct = int(len(terms) * 0.2) term_counts = Counter(terms_as_strs[id(term)] for term in terms) if idf: mod_tfidfs = {term: count * idf[term] if ' ' not in term else count for term, count in term_counts.items()} top_term_texts = {term for term, _ in sorted( mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]} else: top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)} terms = [term for term in terms if terms_as_strs[id(term)] in top_term_texts] # compute term weights from statistical attributes term_weights = {} set_terms_as_str = {terms_as_strs[id(terms)] for terms in terms} n_toks_plus_1 = n_toks + 1 for term in terms: term_str = terms_as_strs[id(term)] pos_first_occ_factor = log(n_toks_plus_1 / (term.start + 1)) # TODO: assess if len(t) puts too much emphasis on long terms # alternative: term_len = 1 if ' ' not in term else sqrt(len(term)) term_len = 1 if ' ' not in term else len(term) term_count = term_counts[term_str] subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str if t2 != term_str and term_str in t2) term_freq_factor = (term_count - subsum_count) if idf and ' ' not in term_str: term_freq_factor *= idf[term_str] term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len # filter terms to only those with positive weights terms = [term for term in terms if term_weights[terms_as_strs[id(term)]] > 0] n_coocs = defaultdict(lambda: defaultdict(int)) sum_logdists = defaultdict(lambda: defaultdict(float)) # iterate over windows for start_ind in range(n_toks): end_ind = start_ind + window_width window_terms = (term for term in terms if start_ind <= term.start <= end_ind) # get all token combinations within window for t1, t2 in itertools.combinations(window_terms, 2): if t1 is t2: continue n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1 try: sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ log(window_width / abs(t1.start - t2.start)) except ZeroDivisionError: # HACK: pretend that they're 1 token apart sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ log(window_width) if end_ind > n_toks: break # compute edge weights between co-occurring terms (nodes) edge_weights = defaultdict(lambda: defaultdict(float)) for t1, t2s in sum_logdists.items(): for t2 in t2s: edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2] # normalize edge weights by sum of outgoing edge weights per term (node) norm_edge_weights = [] for t1, t2s in edge_weights.items(): sum_edge_weights = sum(t2s.values()) norm_edge_weights.extend((t1, t2, {'weight': weight / 
sum_edge_weights}) for t2, weight in t2s.items()) # build the weighted directed graph from edges, rank nodes by pagerank graph = nx.DiGraph() graph.add_edges_from(norm_edge_weights) term_ranks = nx.pagerank_scipy(graph) if isinstance(n_keyterms, float): n_keyterms = int(len(term_ranks) * n_keyterms) return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms] def textrank(doc, n_keyterms=10): """ Convenience function for calling :func:`key_terms_from_semantic_network` with the parameter values used in the TextRank algorithm. Args: doc (:class:`spacy.Doc()`) n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms Returns: see :func:`key_terms_from_semantic_network`. References: Mihalcea, R., & Tarau, P. (2004, July). TextRank: Bringing order into texts. Association for Computational Linguistics. """ return key_terms_from_semantic_network( doc, window_width=2, edge_weighting='binary', ranking_algo='pagerank', join_key_words=False, n_keyterms=n_keyterms) def singlerank(doc, n_keyterms=10): """ Convenience function for calling :func:`key_terms_from_semantic_network` with the parameter values used in the SingleRank algorithm. Args: doc (:class:`spacy.Doc()`) n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms Returns: see :func:`key_terms_from_semantic_network`. References: Hasan, K. S., & Ng, V. (2010, August). Conundrums in unsupervised keyphrase extraction: making sense of the state-of-the-art. In Proceedings of the 23rd International Conference on Computational Linguistics: Posters (pp. 365-373). Association for Computational Linguistics. """ return key_terms_from_semantic_network( doc, window_width=10, edge_weighting='cooc_freq', ranking_algo='pagerank', join_key_words=True, n_keyterms=n_keyterms) def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary', ranking_algo='pagerank', join_key_words=False, n_keyterms=10, **kwargs): """ Extract key terms from a document by ranking nodes in a semantic network of terms, connected by edges and weights specified by parameters. 
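# --- Illustrative usage sketch (not part of the original source) ---------------
# Sketch of the TextRank / SingleRank convenience wrappers defined above; they
# are called here by name because this module's file name is truncated in this
# dump, so no import path is shown. The sample text is an assumption.
from textacy import data

nlp = data.load_spacy_pipeline(lang='en')
doc = nlp(u'Key term extraction ranks candidate terms by their centrality in a '
          u'word co-occurrence network built from the document.')

# top key *words* (window of 2, binary edge weights, no joining)
top_words = textrank(doc, n_keyterms=5)
# top key *terms* (window of 10, co-occurrence-frequency weights, joined words)
top_terms = singlerank(doc, n_keyterms=5)
# --------------------------------------------------------------------------------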
Args: doc (:class:`spacy.Doc()`): window_width (int, optional): width of sliding window in which term co-occurrences are said to occur edge_weighting (str {'binary', 'cooc_freq'}, optional): method used to determine weights of edges between nodes in the semantic network; if 'binary', edge weight is set to 1 for any two terms co-occurring within `window_width` terms; if 'cooc_freq', edge weight is set to the number of times that any two terms co-occur ranking_algo (str {'pagerank', 'divrank', 'bestcoverage'}, optional): algorithm with which to rank nodes in the semantic network; `pagerank` is the canonical (and default) algorithm, but it prioritizes node centrality at the expense of node diversity; the other two attempt to balance centrality with diversity join_key_words (bool, optional): if True, join consecutive key words together into longer key terms, taking the sum of the constituent words' scores as the joined key term's combined score n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms Returns: list[(str, float)]: sorted list of top `n_keyterms` key terms and their corresponding SGRank scores Raises: ValueError: if `n_keyterms` is a float but not in (0.0, 1.0] """ word_list = [spacy_utils.normalized_str(word) for word in doc] good_word_list = [spacy_utils.normalized_str(word) for word in doc if not word.is_stop and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}] if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_keyterms = int(n_keyterms * len(set(good_word_list))) graph = terms_to_semantic_network( good_word_list, window_width=window_width, edge_weighting=edge_weighting) # rank nodes by algorithm, and sort in descending order if ranking_algo == 'pagerank': word_ranks = nx.pagerank_scipy(graph, weight='weight') elif ranking_algo == 'divrank': word_ranks = rank_nodes_by_divrank( graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5)) elif ranking_algo == 'bestcoverage': word_ranks = rank_nodes_by_bestcoverage( graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0)) # bail out here if all we wanted was key *words* and not *terms* if join_key_words is False: return [(word, score) for word, score in sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]] top_n = int(0.25 * len(word_ranks)) top_word_ranks = {word: rank for word, rank in sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]} # join consecutive key words into key terms seen_joined_key_terms = set() joined_key_terms = [] for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks): if key is True: words = list(group) term = ' '.join(words) if term in seen_joined_key_terms: continue seen_joined_key_terms.add(term) joined_key_terms.append((term, sum(word_ranks[word] for word in words))) return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms] def aggregate_term_variants(terms, acro_defs=None, fuzzy_dedupe=True): """ Take a set of unique terms and aggregate terms that are symbolic, lexical, and ordering variants of each other, as well as acronyms and fuzzy string matches. 
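# --- Illustrative usage sketch (not part of the original source) ---------------
# Sketch of the generic `key_terms_from_semantic_network()` defined above, here
# with the DivRank ranking option and its `lambda_` keyword; called by name
# because this module's file name is truncated in this dump. Parameter values
# and sample text are assumptions.
from textacy import data

nlp = data.load_spacy_pipeline(lang='en')
doc = nlp(u'Semantic networks connect co-occurring words; ranking their nodes '
          u'surfaces the words that are both central and diverse.')

key_terms = key_terms_from_semantic_network(
    doc, window_width=10, edge_weighting='cooc_freq', ranking_algo='divrank',
    join_key_words=True, n_keyterms=0.25, lambda_=0.7)
# --------------------------------------------------------------------------------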
Args: terms (set[str]): set of unique terms with potential duplicates acro_defs (dict, optional): if not None, terms that are acronyms will be aggregated with their definitions and terms that are definitions will be aggregated with their acronyms fuzzy_dedupe (bool, optional): if True, fuzzy string matching will be used to aggregate similar terms of a sufficient length using `FuzzyWuzzy `_ Returns: list[set]: each item is a set of aggregated terms Notes: Partly inspired by aggregation of variants discussed in Park, Youngja, Roy J. Byrd, and Branimir K. Boguraev. "Automatic glossary extraction: beyond terminology identification." Proceedings of the 19th international conference on Computational linguistics-Volume 1. Association for Computational Linguistics, 2002. """ # TODO: decide if this would be useful # if lemmatizer is None: # from nltk.stem import WordNetLemmatizer # lemmatizer = WordNetLemmatizer() agg_terms = [] seen_terms = set() for term in sorted(terms, key=len, reverse=True): if term in seen_terms: continue variants = set([term]) seen_terms.add(term) # symbolic variations if '-' in term: variant = term.replace('-', ' ').strip() if variant in terms.difference(seen_terms): variants.add(variant) seen_terms.add(variant) if '/' in term: variant = term.replace('/', ' ').strip() if variant in terms.difference(seen_terms): variants.add(variant) seen_terms.add(variant) # lexical variations term_words = term.split() # last_word = term_words[-1] # # assume last word is a noun # last_word_lemmatized = lemmatizer.lemmatize(last_word, 'n') # # if the same, either already a lemmatized noun OR a verb; try verb # if last_word_lemmatized == last_word: # last_word_lemmatized = lemmatizer.lemmatize(last_word, 'v') # # if at least we have a new term... add it # if last_word_lemmatized != last_word: # term_lemmatized = ' '.join(term_words[:-1] + [last_word_lemmatized]) # if term_lemmatized in terms.difference(seen_terms): # variants.add(term_lemmatized) # seen_terms.add(term_lemmatized) # if term is an acronym, add its definition # if term is a definition, add its acronym if acro_defs: for acro, def_ in acro_defs.items(): if acro.lower() == term.lower(): variants.add(def_.lower()) seen_terms.add(def_.lower()) break elif def_.lower() == term.lower(): variants.add(acro.lower()) seen_terms.add(acro.lower()) break # if 3+ -word term differs by one word at the start or the end # of a longer phrase, aggregate if len(term_words) > 2: term_minus_first_word = ' '.join(term_words[1:]) term_minus_last_word = ' '.join(term_words[:-1]) if term_minus_first_word in terms.difference(seen_terms): variants.add(term_minus_first_word) seen_terms.add(term_minus_first_word) if term_minus_last_word in terms.difference(seen_terms): variants.add(term_minus_last_word) seen_terms.add(term_minus_last_word) # check for "X of Y" <=> "Y X" term variants if ' of ' in term: split_term = term.split(' of ') variant = split_term[1] + ' ' + split_term[0] if variant in terms.difference(seen_terms): variants.add(variant) seen_terms.add(variant) # intense de-duping via fuzzywuzzy for sufficiently long terms if fuzzy_dedupe is True and len(term) >= 13: for other_term in sorted(terms.difference(seen_terms), key=len, reverse=True): if len(other_term) < 13: break tsr = token_sort_ratio(term, other_term) if tsr > 93: variants.add(other_term) seen_terms.add(other_term) break agg_terms.append(variants) return agg_terms def rank_nodes_by_bestcoverage(graph, k, c=1, alpha=1.0): """ Rank nodes in a network using the BestCoverage algorithm that 
attempts to balance between node centrality and diversity. Args: graph (:class:`networkx.Graph()`) k (int): number of results to return for top-k search c (int, optional): *l* parameter for *l*-step expansion; best if 1 or 2 alpha (float, optional): float in [0.0, 1.0] specifying how much of central vertex's score to remove from its *l*-step neighbors; smaller value puts more emphasis on centrality, larger value puts more emphasis on diversity Returns: dict: top `k` nodes as ranked by bestcoverage algorithm; keys as node identifiers, values as corresponding ranking scores References: Küçüktunç, O., Saule, E., Kaya, K., & Çatalyürek, Ü. V. (2013, May). Diversified recommendation on graphs: pitfalls, measures, and algorithms. In Proceedings of the 22nd international conference on World Wide Web (pp. 715-726). International World Wide Web Conferences Steering Committee. http://www2013.wwwconference.org/proceedings/p715.pdf """ alpha = float(alpha) nodes_list = graph.nodes() # ranks: array of PageRank values, summing up to 1 ranks = nx.pagerank_scipy(graph, alpha=0.85, max_iter=100, tol=1e-08, weight='weight') sorted_ranks = sorted(ranks.items(), key=itemgetter(1), reverse=True) avg_degree = sum(deg for _, deg in graph.degree_iter()) / len(nodes_list) # relaxation parameter, k' in the paper k_prime = int(k * avg_degree * c) top_k_sorted_ranks = sorted_ranks[:k_prime] def get_l_step_expanded_set(vertices, l): """ Args: vertices (iterable[str]): vertices to be expanded l (int): how many steps to expand vertices set Returns: set: the l-step expanded set of vertices """ # add vertices to s s = set(vertices) # s.update(vertices) # for each step for _ in range(l): # for each node next_vertices = [] for vertex in vertices: # add its neighbors to the next list neighbors = graph.neighbors(vertex) next_vertices.extend(neighbors) s.update(neighbors) vertices = set(next_vertices) return s top_k_exp_vertices = get_l_step_expanded_set([item[0] for item in top_k_sorted_ranks], c) # compute initial exprel contribution taken = defaultdict(bool) contrib = {} for vertex in nodes_list: # get l-step expanded set s = get_l_step_expanded_set([vertex], c) # sum up neighbors ranks, i.e. l-step expanded relevance contrib[vertex] = sum(ranks[v] for v in s) sum_contrib = 0.0 results = {} # greedily select to maximize exprel metric for _ in range(k): if not contrib: # TODO: check that .items(): not needed break # find word with highest l-step expanded relevance score max_word_score = sorted(contrib.items(), key=itemgetter(1), reverse=True)[0] sum_contrib += max_word_score[1] # contrib[max_word[0]] results[max_word_score[0]] = max_word_score[1] # find its l-step expanded set l_step_expanded_set = get_l_step_expanded_set([max_word_score[0]], c) # for each vertex found for vertex in l_step_expanded_set: # already removed its contribution from neighbors if taken[vertex] is True: continue # remove the contribution of vertex (or some fraction) from its l-step neighbors s1 = get_l_step_expanded_set([vertex], c) for w in s1: try: contrib[w] -= alpha * ranks[vertex] except KeyError: print('***ERROR: word', w, 'not in contrib dict! We\'re approximating...') taken[vertex] = True contrib[max_word_score[0]] = 0 return results def rank_nodes_by_divrank(graph, r=None, lambda_=0.5, alpha=0.5): """ Rank nodes in a network using the DivRank algorithm that attempts to balance between node centrality and diversity. 
Args: graph (:class:`networkx.Graph()`): r (:class:`numpy.array`, optional): the "personalization vector"; by default, `r = ones(1, n)/n` lambda_ (float, optional): must be in [0.0, 1.0] alpha (float, optional): controls the strength of self-links; must be in [0.0, 1.0] Returns: list[tuple]: list of (node, score) tuples ordered by (desc.) divrank score References: Mei, Q., Guo, J., & Radev, D. (2010, July). Divrank: the interplay of prestige and diversity in information networks. In Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 1009-1018). ACM. http://clair.si.umich.edu/~radev/papers/SIGKDD2010.pdf """ # check function arguments if len(graph) == 0: print('**WARNING: Graph graph is empty!') return {} # create adjacency matrix, i.e. # n x n matrix where entry W_ij is the weight of the edge from V_i to V_j W = nx.to_numpy_matrix(graph, weight='weight').A n = W.shape[1] # create flat prior personalization vector if none given if r is None: r = np.array([n * [1 / float(n)]]) # Specify some constants max_iter = 1000 diff = 1e+10 tol = 1e-3 pr = np.array([n * [1 / float(n)]]) # Get p0(v -> u), i.e. transition probability prior to reinforcement tmp = np.reshape(np.sum(W, axis=1), (n, 1)) idx_nan = np.flatnonzero(tmp == 0) W0 = W / np.tile(tmp, (1, n)) W0[idx_nan, :] = 0 del W # DivRank algorithm i = 0 while i < max_iter and diff > tol: W1 = alpha * W0 * np.tile(pr, (n, 1)) W1 = W1 - np.diag(W1[:, 0]) + (1 - alpha) * np.diag(pr[0, :]) tmp1 = np.reshape(np.sum(W1, axis=1), (n, 1)) P = W1 / np.tile(tmp1, (1, n)) P = ((1 - lambda_) * P) + (lambda_ * np.tile(r, (n, 1))) pr_new = np.dot(pr, P) i += 1 diff = np.sum(np.abs(pr_new - pr)) / np.sum(pr) pr = pr_new # sort nodes by divrank score results = sorted(((i, score) for i, score in enumerate(pr.flatten().tolist())), key=itemgetter(1), reverse=True) # replace node number by node value nodes_list = graph.nodes() divranks = {nodes_list[result[0]]: result[1] for result in results} return divranks PKCDHCIrrtextacy/lexicon_methods.py""" Collection of lexicon-based methods for characterizing texts by sentiment, emotional valence, etc. """ from collections import defaultdict from spacy.parts_of_speech import ADJ, ADV, NOUN, VERB from textacy import data # TODO: Do something smarter for averaging emotional valences. def emotional_valence(words, threshold=0.0, dm_data_dir=None, dm_weighting='normfreq'): """ Get average emotional valence over all words for the following emotions: AFRAID, AMUSED, ANGRY, ANNOYED, DONT_CARE, HAPPY, INSPIRED, SAD. Args: words (list[:class:`spacy.Token()`]): list of words for which to get average emotional valence; note that only nouns, adjectives, adverbs, and verbs will be counted threshold (float, optional): minimum emotional valence score for which to count a given word for a given emotion; value must be in [0.0, 1.0) dm_data_dir (str, optional): full path to directory where DepecheMood data is saved on disk dm_weighting (str, {'freq', 'normfreq', 'tfidf'}, optional): type of word weighting used in building DepecheMood matrix Returns: dict: mapping of emotion (str) to average valence score (float) References: Data available at https://github.com/marcoguerini/DepecheMood/releases Staiano and Guerini. DepecheMood: a Lexicon for Emotion Analysis from Crowd-Annotated News. 2014. 
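# --- Illustrative usage sketch (not part of the original source) ---------------
# Toy sketch of the two diversity-aware ranking functions defined above, run on a
# small weighted networkx graph; assumes the networkx 1.x-era API this module
# already targets (degree_iter, list-returning nodes()). Called by name because
# the module's file name is truncated in this dump.
import networkx as nx

graph = nx.Graph()
graph.add_weighted_edges_from([
    ('cat', 'dog', 2.0), ('cat', 'mouse', 1.0), ('dog', 'mouse', 1.0),
    ('dog', 'bone', 3.0), ('mouse', 'cheese', 2.0)])

# top-3 nodes balancing PageRank centrality against 1-step neighborhood diversity
best = rank_nodes_by_bestcoverage(graph, k=3, c=1, alpha=1.0)
# DivRank scores for all nodes, with stronger diversity re-weighting
div = rank_nodes_by_divrank(graph, lambda_=0.9, alpha=0.25)
# --------------------------------------------------------------------------------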
""" dm = data.load_depechemood(data_dir=dm_data_dir, weighting=dm_weighting) pos_to_letter = {NOUN: 'n', ADJ: 'a', ADV: 'r', VERB: 'v'} emo_matches = defaultdict(int) emo_scores = defaultdict(float) for word in words: if word.pos in pos_to_letter: lemma_pos = word.lemma_ + '#' + pos_to_letter[word.pos] try: for emo, score in dm[lemma_pos].items(): if score > threshold: emo_matches[emo] += 1 emo_scores[emo] += score except KeyError: continue for emo in emo_scores: emo_scores[emo] /= emo_matches[emo] return emo_scores PKGDHgtextacy/math_utils.py""" Set of small utility functions that do mathy stuff. """ from __future__ import division import numpy as np # TODO: make this module actually good and useful def cosine_similarity(vec1, vec2): """ Return the cosine similarity between two vectors. Args: vec1 (:class:`numpy.array`) vec2 (:class:`numpy.array`) Returns: float """ return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) def levenshtein_distance(str1, str2): """ Function to find the Levenshtein distance between two words/sentences; gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python Args: str1 (str) str2 (str) Returns: int """ if len(str1) > len(str2): str1, str2 = str2, str1 distances = range(len(str1) + 1) for index2, char2 in enumerate(str2): new_distances = [index2 + 1] for index1, char1 in enumerate(str1): if char1 == char2: new_distances.append(distances[index1]) else: new_distances.append(1 + min((distances[index1], distances[index1 + 1], new_distances[-1]))) distances = new_distances return distances[-1] PK#DHN`S#S#textacy/preprocess.py# -*- coding: utf-8 -*- """ Functions that modify raw text *in-place*, replacing contractions, URLs, emails, phone numbers, and currency symbols with standardized forms. These should be applied before processing by spacy (http://spacy.io), but be warned: preprocessing may affect the interpretation of the text -- and spacy's processing of it. """ from __future__ import absolute_import, division, print_function, unicode_literals import re import unicodedata from ftfy import fix_text from unidecode import unidecode from textacy.regexes_etc import (CURRENCIES, URL_REGEX, SHORT_URL_REGEX, EMAIL_REGEX, PHONE_REGEX, NUMBERS_REGEX, PUNCT_REGEX, CURRENCY_REGEX, LINEBREAK_REGEX, NONBREAKING_SPACE_REGEX) def fix_bad_unicode(text, normalization='NFC'): """ Fix unicode text that's "broken" using ftfy (http://ftfy.readthedocs.org/); this includes mojibake, HTML entities and other code cruft, and non-standard forms for display purposes. Args: text (str): raw text normalization (str {'NFC', 'NFKC', 'NFD', 'NFKD'}, optional): if 'NFC', combines characters and diacritics written using separate code points, e.g. converting "e" plus an acute accent modifier into "é"; unicode can be converted to NFC form without any change in its meaning! if 'NFKC', additional normalizations are applied that can change the meanings of characters, e.g. ellipsis characters will be replaced with three periods Returns: str """ return fix_text(text, normalization=normalization) def transliterate_unicode(text): """ Try to represent unicode data in ascii characters similar to what a human with a US keyboard would choose. Works great for languages of Western origin, worse the farther the language gets from Latin-based alphabets. It's based on hand-tuned character mappings that also contain ascii approximations for symbols and non-Latin alphabets. 
""" return unidecode(text) def normalize_whitespace(text): """ Given str `text`, replace one or more spacings with a single space, and one or more linebreaks with a single newline. Also strip leading/trailing whitespace. """ return NONBREAKING_SPACE_REGEX.sub(' ', LINEBREAK_REGEX.sub(r'\n', text)).strip() def unpack_contractions(text): """ Replace *English* contractions in `text` with their unshortened forms. N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), so are left as-is. """ # standard text = re.sub(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not", text) text = re.sub(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will", text) text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text) text = re.sub(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have", text) # non-standard text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text) text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text) text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text) text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text) text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text) text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text) return text def replace_urls(text, replace_with='*URL*'): """Replace all URLs in str `text` with str `replace_with`.""" return URL_REGEX.sub(replace_with, SHORT_URL_REGEX.sub(replace_with, text)) def replace_emails(text, replace_with='*EMAIL*'): """Replace all emails in str `text` with str `replace_with`.""" return EMAIL_REGEX.sub(replace_with, text) def replace_phone_numbers(text, replace_with='*PHONE*'): """Replace all phone numbers in str `text` with str `replace_with`.""" return PHONE_REGEX.sub(replace_with, text) def replace_numbers(text, replace_with='*NUMBER*'): """Replace all numbers in str `text` with str `replace_with`.""" return NUMBERS_REGEX.sub(replace_with, text) def remove_punct(text): """Remove all punctuation from str `text` (replace punct marks with empty string).""" return PUNCT_REGEX.sub('', text) def replace_currency_symbols(text, replace_with=None): """ Replace all currency symbols in str `text` with string specified by `replace_with`. Args: text (str): raw text replace_with (str, optional): if None (default), replace symbols with their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP'); otherwise, pass in a string with which to replace all symbols (e.g. "*CURRENCY*") Returns: str """ if replace_with is None: for k, v in CURRENCIES.items(): text = text.replace(k, v) return text else: return CURRENCY_REGEX.sub(replace_with, text) def remove_accents(text, method='unicode'): """ Remove accents from any accented unicode characters in str `text`, either by transforming them into ascii equivalents or removing them entirely. 
Args: text (str): raw text method (str {'unicode', 'ascii'}, optional): if 'unicode', remove accented char for any unicode symbol with a direct ASCII equivalent; if 'ascii', remove accented char for any unicode symbol NB: the 'ascii' method is notably faster than 'unicode', but less good Returns: str Raises: ValueError: if `method` is not in {'unicode', 'ascii'} """ if method == 'unicode': return ''.join(c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c)) elif method == 'ascii': return unicodedata.normalize('NFKD', text).encode('ascii', errors='ignore').decode('ascii') else: msg = '`method` must be either "unicode" and "ascii", not {}'.format(method) raise ValueError(msg) def preprocess_text(text, fix_unicode=False, lowercase=False, transliterate=False, no_urls=False, no_emails=False, no_phone_numbers=False, no_numbers=False, no_currency_symbols=False, no_punct=False, no_contractions=False, no_accents=False): """ Normalize various aspects of a raw text doc before parsing it with Spacy. A convenience function for applying all other preprocessing functions in one go. WARNING: These changes may negatively affect subsequent NLP analysis performed on the text, so choose carefully, and preprocess at your own risk! Args: text (str): raw text to preprocess fix_unicode (bool, optional): if True, fix "broken" unicode such as mojibake and garbled HTML entities lowercase (bool, optional): if True, all text is lower-cased transliterate (bool, optional): if True, convert non-ascii characters into their closest ascii equivalents no_urls (bool, optional): if True, replace all URL strings with '*URL*' no_emails (bool, optional): if True, replace all email strings with '*EMAIL*' no_phone_numbers (bool, optional): if True, replace all phone number strings with '*PHONE*' no_numbers (bool, optional): if True, replace all number-like strings with '*NUMBER*' no_currency_symbols (bool, optional): if True, replace all currency symbols with their standard 3-letter abbreviations no_punct (bool, optional): if True, remove all punctuation (replace with empty string) no_contractions (bool, optional): if True, replace *English* contractions with their unshortened forms no_accents (bool, optional): if True, replace all accented characters with unaccented versions; NB: if `transliterate` is True, this option is redundant Returns: str: input `text` processed according to function args """ if fix_unicode is True: text = fix_bad_unicode(text, normalization='NFC') if transliterate is True: text = transliterate_unicode(text) if no_urls is True: text = replace_urls(text) if no_emails is True: text = replace_emails(text) if no_phone_numbers is True: text = replace_phone_numbers(text) if no_numbers is True: text = replace_numbers(text) if no_currency_symbols is True: text = replace_currency_symbols(text) if no_contractions is True: text = unpack_contractions(text) if no_accents is True: text = remove_accents(text, method='unicode') if no_punct is True: text = remove_punct(text) if lowercase is True: text = text.lower() # always normalize whitespace; treat linebreaks separately from spacing text = normalize_whitespace(text) return text PKGDH4*Neetextacy/regexes_etc.py# -*- coding: utf-8 -*- """ Collection of regular expressions and other (small, generally useful) constants. 
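# --- Illustrative usage sketch (not part of the original source) ---------------
# Sketch of the all-in-one `preprocess_text()` defined above, which chains the
# individual helpers; the sample string is an assumption.
from textacy import preprocess

raw = u"Visit http://example.com — it costs $20, and we can't wait!"
clean = preprocess.preprocess_text(
    raw, fix_unicode=True, lowercase=True, no_urls=True,
    no_currency_symbols=True, no_contractions=True, no_punct=True)
# note: punctuation removal also strips the '*' from the '*URL*' placeholder,
# so choose these options with that interaction in mind
# --------------------------------------------------------------------------------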
""" from __future__ import unicode_literals import re import string NUMERIC_NE_TYPES = {'ORDINAL', 'CARDINAL', 'MONEY', 'QUANTITY', 'PERCENT', 'TIME', 'DATE'} SUBJ_DEPS = {'agent', 'csubj', 'csubjpass', 'expl', 'nsubj', 'nsubjpass'} OBJ_DEPS = {'attr', 'dobj', 'dative', 'oprd'} AUX_DEPS = {'aux', 'auxpass', 'neg'} REPORTING_VERBS = {'according', 'accuse', 'acknowledge', 'add', 'admit', 'agree', 'allege', 'announce', 'argue', 'ask', 'assert', 'believe', 'blame', 'charge', 'cite', 'claim', 'complain', 'concede', 'conclude', 'confirm', 'contend', 'criticize', 'declare', 'decline', 'deny', 'describe', 'disagree', 'disclose', 'estimate', 'explain', 'fear', 'hope', 'insist', 'maintain', 'mention', 'note', 'observe', 'order', 'predict', 'promise', 'recall', 'recommend', 'reply', 'report', 'say', 'state', 'stress', 'suggest', 'tell', 'testify', 'think', 'urge', 'warn', 'worry', 'write'} CURRENCIES = {'$': 'USD', 'zł': 'PLN', '£': 'GBP', '¥': 'JPY', '฿': 'THB', '₡': 'CRC', '₦': 'NGN', '₩': 'KRW', '₪': 'ILS', '₫': 'VND', '€': 'EUR', '₱': 'PHP', '₲': 'PYG', '₴': 'UAH', '₹': 'INR'} POS_REGEX_PATTERNS = { 'en': {'NP': r'? * ( ? ?)* ( ?)+', 'PP': r' ? * ( ? ?)* ( ?)+', 'VP': r'* * '} } ACRONYM_REGEX = re.compile(r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))") EMAIL_REGEX = re.compile(r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", flags=re.IGNORECASE) PHONE_REGEX = re.compile(r'(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?\d{3}[ .-]?\d{4}(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))') NUMBERS_REGEX = re.compile(r'(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))') PUNCT_REGEX = re.compile('[{0}]+'.format(re.escape(string.punctuation))) CURRENCY_REGEX = re.compile('[{0}]+'.format(''.join(CURRENCIES.keys()))) LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+') NONBREAKING_SPACE_REGEX = re.compile(r'(?!\n)\s+') URL_REGEX = re.compile( r"(?:^|(?= 224.0.0.0 # excludes network & broadcast addresses # (first & last IP address of each class) r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" # domain name r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" # TLD identifier r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" # port number r"(?::\d{2,5})?" # resource path r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))" , flags=re.UNICODE | re.IGNORECASE) # source: https://gist.github.com/dperini/729294 SHORT_URL_REGEX = re.compile( r"(?:^|(? 
1: mask &= dfs >= min_doc_count if max_n_terms is not None and mask.sum() > max_n_terms: tfs = get_term_freqs(term_doc_matrix, normalized=False) top_mask_inds = (tfs[mask]).argsort()[::-1][:max_n_terms] new_mask = np.zeros(n_terms, dtype=bool) new_mask[np.where(mask)[0][top_mask_inds]] = True mask = new_mask # map old term indices to new ones new_indices = np.cumsum(mask) - 1 id_to_term = {new_indices[old_index]: term for old_index, term in id_to_term.items() if mask[old_index]} kept_indices = np.where(mask)[0] if len(kept_indices) == 0: msg = 'After filtering, no terms remain; try a lower `min_df` or higher `max_df`' raise ValueError(msg) return term_doc_matrix[:, kept_indices], id_to_term def filter_terms_by_ic(term_doc_matrix, id_to_term, min_ic=0.0, max_n_terms=None): """ Filter out terms that are too common and/or too rare (by information content), and compactify the top `max_n_terms` in the `id_to_term` mapping accordingly. Borrows heavily from the sklearn.feature_extraction.text module. Args: term_doc_matrix (:class:`scipy.sparse.csr_matrix`) id_to_term (dict): mapping of unique integer term identifiers to corresponding normalized strings as values min_ic (float, optional): filter terms whose information content is less than this value; must be in [0.0, 1.0] max_n_terms (int, optional): only include terms whose information content is within the top `max_n_terms` Returns: (:class:`scipy.sparse.csr_matrix`, dict): 2-tuple of the filtered `term_doc_matrix` and `id_to_term` Raises: ValueError: if `min_ic` not in [0.0, 1.0] or `max_n_terms` < 0 """ if min_ic == 0.0 and max_n_terms is None: return term_doc_matrix, id_to_term if min_ic < 0.0 or min_ic > 1.0: raise ValueError('min_ic must be a float in [0.0, 1.0]') if max_n_terms is not None and max_n_terms < 0: raise ValueError('max_n_terms may not be negative') _, n_terms = term_doc_matrix.shape # calculate a mask based on document frequencies ics = get_information_content(term_doc_matrix) mask = np.ones(n_terms, dtype=bool) if min_ic > 0.0: mask &= ics >= min_ic if max_n_terms is not None and mask.sum() > max_n_terms: top_mask_inds = (ics[mask]).argsort()[::-1][:max_n_terms] new_mask = np.zeros(n_terms, dtype=bool) new_mask[np.where(mask)[0][top_mask_inds]] = True mask = new_mask # map old term indices to new ones new_indices = np.cumsum(mask) - 1 id_to_term = {new_indices[old_index]: term for old_index, term in id_to_term.items() if mask[old_index]} kept_indices = np.where(mask)[0] if len(kept_indices) == 0: raise ValueError('After filtering, no terms remain; try a lower `min_ic`') return term_doc_matrix[:, kept_indices], id_to_term def readability_stats(doc): """ Get calculated values for a variety of statistics related to the "readability" of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index, Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index. Also includes constituent values needed to compute the stats, e.g. word count. Args: doc (:class:`texts.TextDoc()`) Returns: dict: mapping of readability statistic name (str) to value (int or float) Raises: NotImplementedError: if `doc` is not English language. sorry. 
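Example (a hedged sketch: assumes English spacy model data and the bundled hyphenator are available; syllable counts, and therefore the scores themselves, can vary slightly)::

    >>> from textacy.texts import TextDoc
    >>> doc = TextDoc('Burton writes readable prose. Readers like that.', lang='en')
    >>> stats = readability_stats(doc)
    >>> stats['n_sents']
    2
    >>> sorted(stats)[:2]
    ['automated_readability_index', 'coleman_liau_index']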
""" if doc.lang != 'en': raise NotImplementedError('non-English NLP is not ready yet, sorry') n_sents = doc.n_sents words = doc.words(filter_punct=True) n_words = len(words) n_unique_words = len({word.lower for word in words}) n_chars = sum(len(word) for word in words) hyphenator = data.load_hyphenator(lang='en') syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words] n_syllables = sum(syllables_per_word) n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3) return {'n_sents': n_sents, 'n_words': n_words, 'n_unique_words': n_unique_words, 'n_chars': n_chars, 'n_syllables': n_syllables, 'n_polysyllable_words': n_polysyllable_words, 'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents), 'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents), 'smog_index': smog_index(n_polysyllable_words, n_sents), 'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents), 'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents), 'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)} def flesch_kincaid_grade_level(n_syllables, n_words, n_sents): """https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch.E2.80.93Kincaid_grade_level""" return 11.8 * (n_syllables / n_words) + 0.39 * (n_words / n_sents) - 15.59 def flesch_readability_ease(n_syllables, n_words, n_sents): """https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease""" return -84.6 * (n_syllables / n_words) - 1.015 * (n_words / n_sents) + 206.835 def smog_index(n_polysyllable_words, n_sents, verbose=False): """https://en.wikipedia.org/wiki/SMOG""" if verbose and n_sents < 30: print('**WARNING: SMOG score may be unreliable for n_sents < 30') return 1.0430 * sqrt(30 * (n_polysyllable_words / n_sents)) + 3.1291 def gunning_fog_index(n_words, n_polysyllable_words, n_sents): """https://en.wikipedia.org/wiki/Gunning_fog_index""" return 0.4 * ((n_words / n_sents) + 100 * (n_polysyllable_words / n_words)) def coleman_liau_index(n_chars, n_words, n_sents): """https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index""" return 5.879851 * (n_chars / n_words) - 29.587280 * (n_sents / n_words) - 15.800804 def automated_readability_index(n_chars, n_words, n_sents): """https://en.wikipedia.org/wiki/Automated_readability_index""" return 4.71 * (n_chars / n_words) + 0.5 * (n_words / n_sents) - 21.43 PKUDHA088textacy/text_utils.py""" Set of small utility functions that take text strings as input. """ from __future__ import absolute_import, division, print_function, unicode_literals import re from cld2 import detect as cld2_detect from textacy.regexes_etc import ACRONYM_REGEX def is_acronym(token, exclude=None): """ Pass single token as a string, return True/False if is/is not valid acronym. 
Args: token (str): single word to check for acronym-ness exclude (set[str]): if technically valid but not actually good acronyms are known in advance, pass them in as a set of strings; matching tokens will return False Returns: bool """ # exclude certain valid acronyms from consideration if exclude and token in exclude: return False # don't allow empty strings if not token: return False # don't allow spaces if ' ' in token: return False # 2-character acronyms can't have lower-case letters if len(token) == 2 and not token.isupper(): return False # acronyms can't be all digits if token.isdigit(): return False # acronyms must have at least one upper-case letter or start/end with a digit if (not any(char.isupper() for char in token) and not (token[0].isdigit() or token[-1].isdigit())): return False # acronyms must have between 2 and 10 alphanumeric characters if not 2 <= sum(1 for char in token if char.isalnum()) <= 10: return False # only certain combinations of letters, digits, and '&/.-' allowed if not ACRONYM_REGEX.match(token): return False return True def detect_language(text): """ Detect the most likely language of a text and return its 2-letter code (see https://cloud.google.com/translate/v2/using_rest#language-params). Uses the `cld2-cffi` package (https://pypi.python.org/pypi/cld2-cffi); to take advantage of optional params, call :func:`cld2.detect()` directly. Args: text (str) Returns: str """ is_reliable, _, best_guesses = cld2_detect(text, bestEffort=True) if is_reliable is False: msg = '**WARNING: Text language detected with low confidence; best guesses: {}' print(msg.format(best_guesses)) return best_guesses[0][1] def keyword_in_context(text, keyword, ignore_case=True, window_width=50, print_only=True): """ Search for `keyword` in `text` via regular expression, return or print strings spanning `window_width` characters before and after each occurrence of keyword. Args: text (str): text in which to search for `keyword` keyword (str): technically, any valid regular expression string should work, but usually this is a single word or short phrase: "spam", "spam and eggs"; to account for variations, use regex: "[Ss]pam (and|&) [Ee]ggs?" N.B. If keyword contains special characters, be sure to escape them!!! ignore_case (bool, optional): if True, ignore letter case in `keyword` matching window_width (int, optional): number of characters on either side of `keyword` to include as "context" print_only (bool, optional): if True, print out all results with nice formatting; if False, return all (pre, kw, post) matches as generator of raw strings Returns: generator(tuple(str, str, str)), or None """ flags = re.IGNORECASE if ignore_case is True else 0 if print_only is True: for match in re.finditer(keyword, text, flags=flags): print('{pre} {kw} {post}'.format( pre=text[max(0, match.start() - window_width): match.start()].rjust(window_width), kw=match.group(), post=text[match.end(): match.end() + window_width].ljust(window_width))) else: return ((text[max(0, match.start() - window_width): match.start()], match.group(), text[match.end(): match.end() + window_width]) for match in re.finditer(keyword, text, flags=flags)) PKZDHA\\textacy/texts.py""" Object-oriented interface for processing individual text documents as well as collections (corpora). Wraps other modules' functionality with some amount of caching, for efficiency. 
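A minimal usage sketch (hedged: both classes load a spacy pipeline on creation, so English model data must already be installed)::

    >>> from textacy.texts import TextDoc, TextCorpus
    >>> doc = TextDoc('Textacy wraps spacy. It adds some conveniences.', lang='en')
    >>> doc.n_sents
    2
    >>> corpus = TextCorpus.from_texts(['First text.', 'Second text.'], lang='en')
    >>> corpus.n_docs
    2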
""" from __future__ import absolute_import, division, print_function, unicode_literals import copy import re from cachetools import cachedmethod, LRUCache, hashkey from collections import Counter from functools import partial from operator import attrgetter from spacy.tokens.token import Token as spacy_token from spacy.tokens.span import Span as spacy_span from textacy import (data, extract, spacy_utils, text_stats, text_utils, transform, keyterms) class TextDoc(object): """ Class that tokenizes, tags, and parses a text document, and provides an easy interface to information extraction, alternative document representations, and statistical measures of the text. Args: text (str) spacy_pipeline (:class:`spacy..()`, optional) lang (str, optional) metadata (dict, optional) """ # initialize as class attribute so they're shared across instances spacy_pipeline = None spacy_vocab = None spacy_stringstore = None def __init__(self, text, spacy_pipeline=None, lang='auto', metadata=None, max_cachesize=5): self.lang = text_utils.detect_language(text) if lang == 'auto' else lang self.metadata = {} if metadata is None else metadata self.spacy_pipeline = data.load_spacy_pipeline(lang=self.lang) \ if spacy_pipeline is None else spacy_pipeline self.spacy_vocab = self.spacy_pipeline.vocab self.spacy_stringstore = self.spacy_vocab.strings self.spacy_doc = self.spacy_pipeline(text) self._term_counts = Counter() self._cache = LRUCache(max_cachesize) def __repr__(self): return 'TextDoc({} tokens)'.format(self.n_tokens) def __iter__(self): for tok in self.spacy_doc: yield tok def __len__(self): return self.n_tokens def __getitem__(self, index): return self.spacy_doc[index] @property def sents(self): """Yield the document's sentences as segmented by spacy.""" for sent in self.spacy_doc.sents: yield sent ############### # DOC AS TEXT # @property def text(self): """Return the document's raw text.""" return self.spacy_doc.string @property def tokenized_text(self): """Return text as an ordered, nested list of tokens per sentence.""" return [[token.text for token in sent] for sent in self.spacy_doc.sents] @property def pos_tagged_text(self): """Return text as an ordered, nested list of (token, POS) pairs per sentence.""" return [[(token.text, token.pos_) for token in sent] for sent in self.spacy_doc.sents] ####################### # DOC REPRESENTATIONS # def as_bag_of_terms(self, weighting='tf', normalized=True, binary=False, idf=None, lemmatize='auto', ngram_range=(1, 1), include_nes=False, include_nps=False, include_kts=False): """ Represent doc as a "bag of terms", an unordered set of (term id, term weight) pairs, where term weight may be by TF or TF*IDF. 
Args: weighting (str {'tf', 'tfidf'}, optional): weighting of term weights, either term frequency ('tf') or tf * inverse doc frequency ('tfidf') idf (dict, optional): if `weighting` = 'tfidf', idf's must be supplied externally, such as from a `TextCorpus` object lemmatize (bool or 'auto', optional): if True, lemmatize all terms when getting their frequencies ngram_range (tuple(int), optional): (min n, max n) values for n-grams to include in terms list; default (1, 1) only includes unigrams include_nes (bool, optional): if True, include named entities in terms list include_nps (bool, optional): if True, include noun phrases in terms list include_kts (bool, optional): if True, include key terms in terms list normalized (bool, optional): if True, normalize term freqs by the total number of unique terms binary (bool optional): if True, set all (non-zero) term freqs equal to 1 Returns: :class:`collections.Counter()`: mapping of term ids to corresponding term weights """ term_weights = self.term_counts( lemmatize=lemmatize, ngram_range=ngram_range, include_nes=include_nes, include_nps=include_nps, include_kts=include_kts) if binary is True: term_weights = Counter({key: 1 for key in term_weights.keys()}) elif normalized is True: # n_terms = sum(term_freqs.values()) n_tokens = self.n_tokens term_weights = Counter({key: val / n_tokens for key, val in term_weights.items()}) if weighting == 'tfidf' and idf: term_weights = Counter({key: val * idf[key] for key, val in term_weights.items()}) return term_weights def as_bag_of_concepts(self): raise NotImplementedError() def as_semantic_network(self): raise NotImplementedError() ########################## # INFORMATION EXTRACTION # @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'words')) def words(self, **kwargs): """ Extract an ordered list of words from a spacy-parsed doc, optionally filtering words by part-of-speech (etc.) and frequency. See :func:`extract.words()` for all function kwargs. """ return extract.words(self.spacy_doc, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'ngrams')) def ngrams(self, n, **kwargs): """ Extract an ordered list of n-grams (`n` consecutive words) from doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: n (int): number of tokens to include in n-grams; 1 => unigrams, 2 => bigrams See :func:`extract.ngrams()` for all function kwargs. """ return extract.ngrams(self.spacy_doc, n, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'named_entities')) def named_entities(self, **kwargs): """ Extract an ordered list of named entities (PERSON, ORG, LOC, etc.) from doc, optionally filtering by the entity types and frequencies. See :func:`extract.named_entities()` for all function kwargs. """ return extract.named_entities(self.spacy_doc, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'noun_phrases')) def noun_phrases(self, **kwargs): """ Extract an ordered list of noun phrases from doc, optionally filtering by frequency and dropping leading determiners. See :func:`extract.noun_phrases()` for all function kwargs. """ return extract.noun_phrases(self.spacy_doc, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'pos_regex_matches')) def pos_regex_matches(self, pattern): """ Extract sequences of consecutive tokens from a spacy-parsed doc whose part-of-speech tags match the specified regex pattern. 
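For example (a hedged sketch: the angle-bracketed POS tags seem to have been stripped from the documented patterns in this copy, so the pattern below is an assumption modeled on the universal tag set)::

    >>> doc.pos_regex_matches(r'<DET>? <ADJ>* <NOUN>+')  # doctest: +SKIP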
Args: pattern (str): Pattern of consecutive POS tags whose corresponding words are to be extracted, inspired by the regex patterns used in NLTK's `nltk.chunk.regexp`. Tags are uppercase, from the universal tag set; delimited by < and >, which are basically converted to parentheses with spaces as needed to correctly extract matching word sequences; white space in the input doesn't matter. Examples (see `regexes_etc.POS_REGEX_PATTERNS`): * noun phrase: r'? (+ )* +' * compound nouns: r'+' * verb phrase: r'?*+' * prepositional phrase: r' ? (+)* +' See :func:`data.get_pos_regex_pattern()` for common examples. """ return extract.pos_regex_matches(self.spacy_doc, pattern) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'subject_verb_object_triples')) def subject_verb_object_triples(self): """ Extract an *un*ordered list of distinct subject-verb-object (SVO) triples from doc. """ return extract.subject_verb_object_triples(self.spacy_doc) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'acronyms_and_definitions')) def acronyms_and_definitions(self, **kwargs): """ Extract a collection of acronyms and their most likely definitions, if available, from doc. If multiple definitions are found for a given acronym, only the most frequently occurring definition is returned. See :func:`extract.acronyms_and_definitions()` for all function kwargs. """ return extract.acronyms_and_definitions(self.spacy_doc, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'semistructured_statements')) def semistructured_statements(self, entity, **kwargs): """ Extract "semi-structured statements" from doc, each as a (entity, cue, fragment) triple. This is similar to subject-verb-object triples. Args: entity (str): a noun or noun phrase of some sort (e.g. "President Obama", "global warming", "Python") See :func:`extract.semistructured_statements()` for all function kwargs. """ return extract.semistructured_statements( self.spacy_doc, entity, **kwargs) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'direct_quotations')) def direct_quotations(self): """ Baseline, not-great attempt at direction quotation extraction (no indirect or mixed quotations) using rules and patterns. English only. """ return extract.direct_quotations(self.spacy_doc) @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'key_terms')) def key_terms(self, algorithm='sgrank', n=10): """ Extract key terms from a document using `algorithm`. 
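For example (a hedged sketch: results come back as (term, score) pairs and depend entirely on the parsed text)::

    >>> doc.key_terms(algorithm='textrank', n=5)  # doctest: +SKIP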
Args: algorithm (str {'sgrank', 'textrank', 'singlerank'}, optional): name of algorithm to use for key term extraction n (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0.0, 1.0), representing the fraction of top-ranked terms to return as keyterms Raises: ValueError: if `algorithm` not in {'sgrank', 'textrank', 'singlerank'} """ if algorithm == 'sgrank': return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n) elif algorithm == 'textrank': return keyterms.textrank(self.spacy_doc, n_keyterms=n) elif algorithm == 'singlerank': return keyterms.singlerank(self.spacy_doc, n_keyterms=n) else: raise ValueError('algorithm {} not a valid option'.format(algorithm)) ############## # STATISTICS # @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'term_counts')) def term_counts(self, lemmatize='auto', ngram_range=(1, 1), include_nes=False, include_nps=False, include_kts=False): """ Get the number of occurrences ("counts") of each unique term in doc; terms may be words, n-grams, named entities, noun phrases, and key terms. Args: lemmatize (bool or 'auto', optional): if True, lemmatize all terms when getting their frequencies; if 'auto', lemmatize all terms that aren't proper nouns or acronyms ngram_range (tuple(int), optional): (min n, max n) values for n-grams to include in terms list; default (1, 1) only includes unigrams include_nes (bool, optional): if True, include named entities in terms list include_nps (bool, optional): if True, include noun phrases in terms list include_kts (bool, optional): if True, include key terms in terms list Returns: :class:`collections.Counter()`: mapping of unique term ids to corresponding term counts """ if lemmatize == 'auto': get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)] elif lemmatize is True: get_id = lambda x: self.spacy_stringstore[x.lemma_] else: get_id = lambda x: self.spacy_stringstore[x.text] for n in range(ngram_range[0], ngram_range[1] + 1): if n == 1: self._term_counts = self._term_counts | Counter( get_id(word) for word in self.words()) else: self._term_counts = self._term_counts | Counter( get_id(ngram) for ngram in self.ngrams(n)) if include_nes is True: self._term_counts = self._term_counts | Counter( get_id(ne) for ne in self.named_entities()) if include_nps is True: self._term_counts = self._term_counts | Counter( get_id(np) for np in self.noun_phrases()) if include_kts is True: # HACK: key terms are currently returned as strings # TODO: cache key terms, and return them as spacy spans get_id = lambda x: self.spacy_stringstore[x] self._term_counts = self._term_counts | Counter( get_id(kt) for kt, _ in self.key_terms()) return self._term_counts def term_count(self, term): """ Get the number of occurrences ("count") of term in doc. Args: term (str or `spacy.Token` or `spacy.Span`) Returns: int """ # figure out what object we're dealing with here; convert as necessary if isinstance(term, str): term_text = term term_id = self.spacy_stringstore[term_text] term_len = term_text.count(' ') + 1 elif isinstance(term, spacy_token): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = 1 elif isinstance(term, spacy_span): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = len(term) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # have we not already counted the appropriate `n` n-grams? 
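# (that is, if `_term_counts` holds no term with the same number of words as `term_len`, count all terms of that length once and re-check; only if the term is still missing do we fall back to a raw regex search over the document text below)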
if not any(self.spacy_stringstore[t].count(' ') == term_len for t in self._term_counts): get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)] if term_len == 1: self._term_counts += Counter(get_id(w) for w in self.words()) else: self._term_counts += Counter(get_id(ng) for ng in self.ngrams(term_len)) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # last resort: try a regular expression return sum(1 for _ in re.finditer(re.escape(term_text), self.text)) @property def n_tokens(self): """The number of tokens in the document -- including punctuation.""" return len(self.spacy_doc) def n_words(self, filter_stops=False, filter_punct=True, filter_nums=False): """ The number of words in the document, with optional filtering of stop words, punctuation (on by default), and numbers. """ return len(self.words(filter_stops=filter_stops, filter_punct=filter_punct, filter_nums=filter_nums)) @property def n_sents(self): """The number of sentences in the document.""" return sum(1 for _ in self.spacy_doc.sents) def n_paragraphs(self, pattern=r'\n\n+'): """The number of paragraphs in the document, as delimited by `pattern`.""" return sum(1 for _ in re.finditer(pattern, self.text)) + 1 @cachedmethod(attrgetter('_cache'), key=partial(hashkey, 'readability_stats')) def readability_stats(self): return text_stats.readability_stats(self) class TextCorpus(object): """ A collection of :class:`TextDoc`s with some syntactic sugar and functions to compute corpus statistics. Initalize with a particular language (English by default). Add documents to corpus by `TextCorpus.add_text()`. Iterate over corpus docs with `for doc in TextCorpus`. Access individual docs by index (e.g. `TextCorpus[0]` or `TextCorpus[0:10]`) or by boolean condition specified by lambda function (e.g. `TextCorpus.get_docs(lambda x: len(x) > 100)`). """ def __init__(self, lang='en'): self.lang = lang self.spacy_pipeline = data.load_spacy_pipeline(lang=self.lang) self.spacy_vocab = self.spacy_pipeline.vocab self.spacy_stringstore = self.spacy_vocab.strings self.docs = [] self.n_docs = 0 self.n_tokens = 0 def __repr__(self): return 'TextCorpus({} docs, {} tokens)'.format(self.n_docs, self.n_tokens) def __iter__(self): for doc in self.docs: yield doc def __len__(self): return self.n_docs def __getitem__(self, index): return self.docs[index] @classmethod def from_texts(cls, texts, lang='en'): """ Convenience function for creating a `TextCorpus` from an iterable of text strings. NOTE: Only useful for texts without additional metadata. Args: texts (iterable(str)) lang (str, optional) Returns: :class:`TextCorpus` """ textcorpus = cls(lang=lang) for text in texts: textcorpus.add_text(text) return textcorpus def add_text(self, text, lang='en', metadata=None): """ Create a :class:`TextDoc` from `text` and `metadata`, then add it to the corpus. Args: text (str): raw text document to add to corpus as newly instantiated `TextDoc` lang (str, optional): metadata (dict, optional): dictionary of document metadata, such as:: {"title": "My Great Doc", "author": "Burton DeWilde"} NOTE: may be useful for retrieval via :func:`get_docs()`, e.g. `TextCorpus.get_docs(lambda x: x.metadata["title"] == "My Great Doc")` """ doc = TextDoc(text, spacy_pipeline=self.spacy_pipeline, lang=lang, metadata=metadata) doc.corpus_index = self.n_docs doc.corpus = self self.docs.append(doc) self.n_docs += 1 self.n_tokens += doc.n_tokens def add_doc(self, textdoc, print_warning=True): """ Add an existing :class:`TextDoc` to the corpus as-is. 
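For example (a hedged sketch: both objects load spacy data on creation, so an English model must be installed)::

    >>> corpus = TextCorpus(lang='en')
    >>> doc = TextDoc('A lonely little document.', lang='en')
    >>> corpus.add_doc(doc)
    >>> corpus.n_docs
    1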
NB: If `textdoc` is already added to this or another `TextCorpus`, a warning message will be printed and the `corpus_index` attribute will be overwritten, but you won't be prevented from adding the doc. Args: textdoc ():class:`TextDoc`) print_warning (bool, optional): if True, print a warning message if `textdoc` already added to a corpus; otherwise, don't ever print the warning and live dangerously """ if hasattr(textdoc, 'corpus_index'): textdoc = copy.deepcopy(textdoc) if print_warning is True: print('**WARNING: TextDoc already associated with a TextCorpus; adding anyway...') textdoc.corpus_index = self.n_docs textdoc.corpus = self self.docs.append(textdoc) self.n_docs += 1 self.n_tokens += textdoc.n_tokens def get_docs(self, match_condition, limit=None): """ Iterate over all docs in corpus and return all (or N=`limit`) for which `match_condition(doc) is True`. Args: match_condition (func): function that operates on a :class:`TextDoc` and returns a boolean value; e.g. `lambda x: len(x) > 100` matches all docs with more than 100 tokens limit (int, optional): if not `None`, maximum number of matched docs to return Yields: class:`TextDoc`: one per doc passing `match_condition` up to `limit` docs """ if limit is None: for doc in self: if match_condition(doc) is True: yield doc else: n_matched_docs = 0 for doc in self: if match_condition(doc) is True: n_matched_docs += 1 if n_matched_docs > limit: break yield doc def remove_doc(self, index): """Remove the document at `index` from the corpus, and decrement the `corpus_index` attribute on all docs that come after it in the corpus.""" for doc in self[index + 1:]: doc.corpus_index -= 1 del self[index] def remove_docs(self, match_condition, limit=None): """ Remove all (or N=`limit`) docs in corpus for which `match_condition(doc) is True`. Re-set all remaining doc' `corpus_index` attributes at the end. Args: match_condition (func): function that operates on a :class:`TextDoc` and returns a boolean value; e.g. `lambda x: len(x) > 100` matches all docs with more than 100 tokens limit (int, optional): if not `None`, maximum number of matched docs to remove """ remove_indexes = [doc.corpus_index for doc in self.get_docs(match_condition, limit=limit)] for index in remove_indexes: del self[index] # now let's re-set the `corpus_index` attribute for all docs at once for i, doc in enumerate(self): doc.corpus_index = i def to_term_doc_matrix(self, weighting='tf', normalize=True, binarize=False, smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None, ngram_range=(1, 1), include_nes=False, include_nps=False, include_kts=False): """ Transform corpus into a sparse CSR matrix, where each row i corresponds to a doc, each column j corresponds to a unique term, and matrix values (i, j) correspond to the tf or tf-idf weighting of term j in doc i. """ return transform.corpus_to_term_doc_matrix( self, weighting=weighting, normalize=normalize, binarize=binarize, smooth_idf=smooth_idf, min_df=min_df, max_df=max_df, min_ic=min_ic, max_n_terms=max_n_terms, ngram_range=ngram_range, include_nes=include_nes, include_nps=include_nps, include_kts=include_kts) PKaDH_--textacy/transform.py""" Functions to transform documents and corpora into other representations, including term-document matrices, semantic networks, and ... WIP. 
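A minimal sketch of the term-document-matrix path (hedged: assumes a small English corpus and installed spacy model data)::

    >>> from textacy.texts import TextCorpus
    >>> from textacy import transform
    >>> corpus = TextCorpus.from_texts(['The cat sat on the mat.', 'The dog chased the cat.'])
    >>> tdm, id_to_term = transform.corpus_to_term_doc_matrix(corpus, weighting='tfidf')
    >>> tdm.shape[0]  # one row per document
    2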
""" import itertools import networkx as nx import numpy as np from collections import defaultdict from cytoolz import itertoolz from scipy import sparse from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.preprocessing import binarize as binarize_mat from sklearn.preprocessing import normalize as normalize_mat from spacy import attrs from spacy.tokens.span import Span as spacy_span from spacy.tokens.token import Token as spacy_token from textacy import extract, text_stats from textacy.spacy_utils import normalized_str # TODO: bag-of-words? bag-of-concepts? gensim-compatible corpus and dictionary? def terms_to_semantic_network(terms, window_width=10, edge_weighting='cooc_freq'): """ Convert an ordered list of non-overlapping terms into a semantic network, where each terms is represented by a node with edges linking it to other terms that co-occur within `window_width` terms of itself. Args: terms (list(str) or list(:class:`spacy.Token()`)) window_width (int, optional): size of sliding window over `terms` that determines which are said to co-occur; if = 2, only adjacent terms will have edges in network edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary', all co-occurring terms will have network edges with weight = 1; if 'cooc_freq', edges will have a weight equal to the number of times that the connected nodes co-occur in a sliding window Returns: :class:`networkx.Graph()` Notes: * Be sure to filter out stopwords, punctuation, certain parts of speech, etc. from the terms list before passing it to this function * Multi-word terms, such as named entities and compound nouns, must be merged into single strings or spacy.Tokens beforehand * If terms are already strings, be sure to normalize so that like terms are counted together (see :func:`spacy_utils.normalized_str`) """ if window_width < 2: raise ValueError('Window width must be >= 2.') if isinstance(terms[0], str): windows = itertoolz.sliding_window(window_width, terms) elif isinstance(terms[0], spacy_token): windows = ((normalized_str(tok) for tok in window) for window in itertoolz.sliding_window(window_width, terms)) else: msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(type(terms[0])) raise TypeError(msg) graph = nx.Graph() if edge_weighting == 'cooc_freq': cooc_mat = defaultdict(lambda: defaultdict(int)) for window in windows: for w1, w2 in itertools.combinations(sorted(window), 2): cooc_mat[w1][w2] += 1 graph.add_edges_from( (w1, w2, {'weight': cooc_mat[w1][w2]}) for w1, w2s in cooc_mat.items() for w2 in w2s) elif edge_weighting == 'binary': graph.add_edges_from( w1_w2 for window in windows for w1_w2 in itertools.combinations(window, 2)) return graph def sents_to_semantic_network(sents, edge_weighting='cosine'): """ Convert a list of sentences into a semantic network, where each sentence is represented by a node with edges linking it to other sentences weighted by the (cosine or jaccard) similarity of their constituent words. Args: sents (list(str) or list(:class:`spacy.Span`)) edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric to use for weighting edges between sentences; if 'cosine', use the cosine similarity between sentences represented as tf-idf word vectors; if 'jaccard', use the set intersection divided by the set union of all words in a given sentence pair Returns: :class:`networkx.Graph()`: nodes are the integer indexes of the sentences in the input `sents` list, _not_ the actual text of the sentences! 
Notes: * If passing sentences as strings, be sure to filter out stopwords, punctuation, certain parts of speech, etc. beforehand * Consider normalizing the strings so that like terms are counted together (see :func:`spacy_utils.normalized_str`) """ n_sents = len(sents) if isinstance(sents[0], str): pass elif isinstance(sents[0], spacy_span): sents = [' '.join(normalized_str(tok) for tok in extract.words(sent, filter_stops=True, filter_punct=True, filter_nums=False)) for sent in sents] else: msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0])) raise TypeError(msg) if edge_weighting == 'cosine': term_sent_matrix = TfidfVectorizer().fit_transform(sents) elif edge_weighting == 'jaccard': term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents) weights = (term_sent_matrix * term_sent_matrix.T).A.tolist() graph = nx.Graph() graph.add_edges_from( (i, j, {'weight': weights[i][j]}) for i in range(n_sents) for j in range(i + 1, n_sents)) return graph def corpus_to_term_doc_matrix(corpus, weighting='tf', normalize=True, binarize=False, smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None, ngram_range=(1, 1), include_nes=False, include_nps=False, include_kts=False): """ Transform a collection of spacy docs (`corpus`) into a sparse CSR matrix, where each row i corresponds to a doc, each column j corresponds to a unique term. Args: corpus (:class:`texts.TextCorpus`) weighting (str {'tf', 'tfidf'}, optional): if 'tf', matrix values (i, j) correspond to the number of occurrences of term j in doc i; if 'tfidf', term frequencies (tf) are multiplied by their corresponding inverse document frequencies (idf) normalize (bool, optional): if True, normalize term frequencies by the L1 norms of the vectors binarize (bool, optional): if True, set all term frequencies greater than 0 equal to 1 smooth_idf (bool, optional): if True, add 1 to all document frequencies, equivalent to adding a single document to the corpus containing every unique term min_df (float or int, optional): if float, value is the fractional proportion of the total number of documents, which must be in [0.0, 1.0]; if int, value is the absolute number; filter terms whose document frequency is less than `min_df` max_df (float or int, optional): if float, value is the fractional proportion of the total number of documents, which must be in [0.0, 1.0]; if int, value is the absolute number; filter terms whose document frequency is greater than `max_df` min_ic (float, optional): filter terms whose information content is less than `min_ic`; value must be in [0.0, 1.0] max_n_terms (int, optional): only include terms whose document frequency is within the top `max_n_terms` ngram_range (tuple(int, int), optional): range of ngrams to include as terms; default is unigrams only include_nes (bool, optional): if True, include named entities as terms (columns) in the matrix include_nps (bool, optional): if True, include noun phrases as terms (columns) in the matrix include_kts (bool, optional): if True, include SGRank key terms as terms (columns) in the matrix Returns: tuple(:class:`scipy.sparse.csr_matrix`, dict): 2-tuple of a weighted `term_doc_matrix` (an N X M matrix, where N is the # of docs, M is the # of unique terms, and value (n, m) is the weight of term m in doc n) and an `id_to_term` mapping (dict with unique integer term identifiers as keys and the corresponding normalized strings as values) """ id_to_term = defaultdict() id_to_term.default_factory = id_to_term.__len__ rows = [] cols = [] data = 
[] for doc in corpus: term_counts = doc.term_counts(ngram_range=ngram_range, include_nes=include_nes, include_nps=include_nps, include_kts=include_kts) row = doc.corpus_index rows.extend(itertools.repeat(row, times=len(term_counts))) cols.extend(id_to_term[key] for key in term_counts.keys()) data.extend(term_counts.values()) id_to_term = {val: corpus.spacy_stringstore[key] for key, val in id_to_term.items()} term_doc_matrix = sparse.coo_matrix((data, (rows, cols)), dtype=np.float64).tocsr() # filter terms by document frequency? if max_df != 1.0 or min_df != 1 or max_n_terms is not None: term_doc_matrix, id_to_term = text_stats.filter_terms_by_df( term_doc_matrix, id_to_term, max_df=max_df, min_df=min_df, max_n_terms=max_n_terms) if min_ic != 0.0: term_doc_matrix, id_to_term = text_stats.filter_terms_by_ic( term_doc_matrix, id_to_term, min_ic=min_ic, max_n_terms=max_n_terms) if normalize is True: term_doc_matrix = normalize_mat(term_doc_matrix, norm='l1', axis=1, copy=False) elif binarize is True: term_doc_matrix = binarize_mat(term_doc_matrix, threshold=0.0, copy=False) if weighting == 'tfidf': dfs = text_stats.get_doc_freqs(term_doc_matrix, normalized=False) if smooth_idf is True: n_docs = term_doc_matrix.shape[0] + 1 dfs += 1 idfs = np.log(n_docs / dfs) + 1.0 term_doc_matrix = term_doc_matrix.multiply(idfs) return term_doc_matrix, id_to_term def corpus_to_cscmatrix(corpus, lemmatize=False): """ Transform a list of spacy docs (`corpus`) into a sparse CSC matrix, where each row i corresponds to a unique term, each column j corresponds to a doc, and values (i, j) correspond to the number of occurrences of term i in doc j. Args: corpus (list(:class:`spacy.Doc()`)) lemmatize (bool, optional) Returns: :class:`scipy.sparse.csc_matrix` """ num_nnz = 0 data = [] indices = [] indptr = [0] for doc in corpus: if lemmatize is False: term_freqs = list(doc.count_by(attrs.ORTH).items()) else: term_freqs = list(doc.count_by(attrs.LEMMA).items()) indices.extend(term for term, _ in term_freqs) data.extend(freq for _, freq in term_freqs) num_nnz += len(term_freqs) indptr.append(num_nnz) num_terms = max(indices) + 1 if indices else 0 num_docs = len(indptr) - 1 data = np.asarray(data, dtype=np.int32) indices = np.asarray(indices) return sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=np.int32) PKEH<0p'textacy-0.1.1.dist-info/DESCRIPTION.rst## textacy: [tagline] - elevator pitch - key features outline - installation instructions / requirements - quick start / complete usage example - link(s) to full documentation ### Usage For a single text: ```python text = "Somewhere in la Mancha, in a place whose name I do not care to remember, a gentleman lived not long ago, one of those who has a lance and ancient shield on a shelf and keeps a skinny nag and a greyhound for racing." metadata = {"title": "Don Quixote", "author": "Miguel de Cervantes"} doc = TextDoc(text, metadata=metadata, lang="en") ``` For multiple texts: ```python texts = ["Many years later, as he faced the firing squad, Colonel Aureliano Buendía was to remember that distant afternoon when his father took him to discover ice.", "The universe (which others call the Library) is composed of an indefinite and perhaps infinite number of hexagonal galleries, with vast air shafts between, surrounded by very low railings."] corpus = TextCorpus.from_texts(texts, lang="en") ``` ### Maintainers - Burton DeWilde () ### TODOs - TODO: reduce dependencies on large external packages (e.g. 
pandas) - TODO: extract: return generators rather than lists? - TODO: texts: figure out what to do when documents are modified in-place (`doc.merge`) - TODO: texts: ^ related: when docs modified, erase cached_property attributes so they'll be re-caclulated - TODO: texts: ^related: update doc merge functions when Honnibal updates API - TODO: texts: what to do when new doc added to textcorpus does not have same language? - TODO: texts: have textdocs inherit `_term_doc_freqs` from textcorpus? - TODO: texts: add `doc_to_bag_of_terms()` func to transform? - TODO: transform: condense csc matrix by mapping stringstore term ints to incremented vals, starting at 0 - TODO: drop scipy dependency and switch to honnibal's own sparse matrices - TODO: preprocess: add basic tests for unidecode and ftfy functions PKEH&4ZZ%textacy-0.1.1.dist-info/metadata.json{"classifiers": ["Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Intended Audience :: Developers", "Programming Language :: Python", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Natural Language :: English", "Topic :: Text Processing :: Linguistic"], "extensions": {"python.details": {"contacts": [{"email": "burtondewilde@gmail.com", "name": "Burton DeWilde", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/chartbeat-labs/textacy"}}}, "extras": [], "generator": "bdist_wheel (0.27.0)", "keywords": ["textacy", "spacy", "nlp", "text", "processing", "linguistics"], "license": "Apache", "metadata_version": "2.0", "name": "textacy", "run_requires": [{"requires": ["cachetools", "cld2-cffi", "cytoolz", "ftfy", "fuzzywuzzy", "networkx", "nltk", "numpy (>=1.8.0)", "pandas", "pyphen", "scikit-learn", "scipy", "spacy (>=0.100.0)", "unidecode"]}], "summary": "Higher-level text processing, built on Spacy", "version": "0.1.1"}PKEH\$%textacy-0.1.1.dist-info/top_level.txttextacy PKEHyCQ%nntextacy-0.1.1.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.27.0) Root-Is-Purelib: true Tag: py2-none-any Tag: py3-none-any PKEH4N textacy-0.1.1.dist-info/METADATAMetadata-Version: 2.0 Name: textacy Version: 0.1.1 Summary: Higher-level text processing, built on Spacy Home-page: https://github.com/chartbeat-labs/textacy Author: Burton DeWilde Author-email: burtondewilde@gmail.com License: Apache Keywords: textacy,spacy,nlp,text processing,linguistics Platform: UNKNOWN Classifier: Development Status :: 4 - Beta Classifier: License :: OSI Approved :: Apache Software License Classifier: Intended Audience :: Developers Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Natural Language :: English Classifier: Topic :: Text Processing :: Linguistic Requires-Dist: cachetools Requires-Dist: cld2-cffi Requires-Dist: cytoolz Requires-Dist: ftfy Requires-Dist: fuzzywuzzy Requires-Dist: networkx Requires-Dist: nltk Requires-Dist: numpy (>=1.8.0) Requires-Dist: pandas Requires-Dist: pyphen Requires-Dist: scikit-learn Requires-Dist: scipy Requires-Dist: spacy (>=0.100.0) Requires-Dist: unidecode ## textacy: [tagline] - elevator pitch - key features outline - installation instructions / requirements - quick start / complete usage example - link(s) to full documentation ### Usage For a single text: ```python text = "Somewhere in la Mancha, in a place 
whose name I do not care to remember, a gentleman lived not long ago, one of those who has a lance and ancient shield on a shelf and keeps a skinny nag and a greyhound for racing." metadata = {"title": "Don Quixote", "author": "Miguel de Cervantes"} doc = TextDoc(text, metadata=metadata, lang="en") ``` For multiple texts: ```python texts = ["Many years later, as he faced the firing squad, Colonel Aureliano Buendía was to remember that distant afternoon when his father took him to discover ice.", "The universe (which others call the Library) is composed of an indefinite and perhaps infinite number of hexagonal galleries, with vast air shafts between, surrounded by very low railings."] corpus = TextCorpus.from_texts(texts, lang="en") ``` ### Maintainers - Burton DeWilde () ### TODOs - TODO: reduce dependencies on large external packages (e.g. pandas) - TODO: extract: return generators rather than lists? - TODO: texts: figure out what to do when documents are modified in-place (`doc.merge`) - TODO: texts: ^ related: when docs modified, erase cached_property attributes so they'll be re-caclulated - TODO: texts: ^related: update doc merge functions when Honnibal updates API - TODO: texts: what to do when new doc added to textcorpus does not have same language? - TODO: texts: have textdocs inherit `_term_doc_freqs` from textcorpus? - TODO: texts: add `doc_to_bag_of_terms()` func to transform? - TODO: transform: condense csc matrix by mapping stringstore term ints to incremented vals, starting at 0 - TODO: drop scipy dependency and switch to honnibal's own sparse matrices - TODO: preprocess: add basic tests for unidecode and ftfy functions PKEHLTS7textacy-0.1.1.dist-info/RECORDtextacy/__init__.py,sha256=Y-gYVt17PqKjdOz6l4w7i2QFjihQb4MH6NxtHOyGX9Y,427 textacy/data.py,sha256=pUHGVM9aO8jGxBeyLg4vT7j_8CV44D8ziI32DKIE_tk,4325 textacy/extract.py,sha256=1BolXS6m-bzyU9cJcyZOCmMOqQQKdSvss6OPBRKSPMo,28408 textacy/keyterms.py,sha256=IomFWu00xRyDT76pov2WWl_Q-tlW72nQvyF6y5xEVgU,24356 textacy/lexicon_methods.py,sha256=PBu_XnD3KTOS-rTNSSrUiyY_F1VYDS-oziY28llDBww,2162 textacy/math_utils.py,sha256=Ke87rIYVi5FJSDKXIdjh9yrp8nTao9wgBi2Po4SzJHE,1234 textacy/preprocess.py,sha256=1d_rlg4E0vi6NU4Qi8kUZJ_-NzFF1-b7RLUQjAyFmgE,9043 textacy/regexes_etc.py,sha256=u8PUbKoU25spdL_XQ9zARwO5GjuvJ6t5WITfmEhvEXQ,3941 textacy/spacy_utils.py,sha256=f5yl047p-Hyz7kzA4dRgle82HrAs3zK8J1HhVmt3QF4,5538 textacy/text_stats.py,sha256=gdjGgsNAkvLQ408I27Bft_QvF-B2sPImkmujouPkdV4,12304 textacy/text_utils.py,sha256=2YkBGBeymWs8PHv8YYzI5nPmCZAv5aer_l7Zxh7EHWA,4152 textacy/texts.py,sha256=niWNtCiEqvYsljJdYDBhzrMUTJNJJnwUCQIE7jRfzPo,23687 textacy/transform.py,sha256=0O2q7bCnFqrzKlz0KgDpAdSZydGTLzexPQbpC_V3uBU,11542 textacy-0.1.1.dist-info/DESCRIPTION.rst,sha256=7ZXXKT2z-BWc0wJRLd1EHcyg31pY3X7F9mvLsEe7780,1964 textacy-0.1.1.dist-info/METADATA,sha256=8xQPdYf8nCPn-ZhnZ2-SArFO4LJZM3S0zPAcRBZ66Jg,3054 textacy-0.1.1.dist-info/RECORD,, textacy-0.1.1.dist-info/WHEEL,sha256=22a9oDdbpVoTeukNLAOY0JJtL4OnhDR-_josH9ArXbA,110 textacy-0.1.1.dist-info/metadata.json,sha256=W9anwD85KmCax59ihYuCPw1UTWX2dKIGXJ05ddNkvpw,1114 textacy-0.1.1.dist-info/top_level.txt,sha256=0KI12Ce8PbJgeT0zueCfxJYO4wt_V7A4hlRGU5WEwz8,8 PK DHtextacy/__init__.pyPKGDH>textacy/data.pyPKDH[Mnntextacy/extract.pyPK