PK!{{corpy/__init__.py"""Top-level CorPy package. Refer to the documentation of the individual packages for details. """ __version__ = "0.2.0" PK!Z?corpy/morphodita/__init__.py"""Convenient and easy-to-use MorphoDiTa wrappers. """ import logging logging.basicConfig(level=logging.INFO) LOG = logging.getLogger(__package__) LOG.setLevel(logging.INFO) from .tokenizer import Tokenizer # noqa: E402, F401 from .tagger import Tagger # noqa: E402, F401 PK!ڔw corpy/morphodita/tagger.py"""An interface to MorphoDiTa taggers. """ from collections import namedtuple from collections.abc import Iterable from lazy import lazy from functools import lru_cache import ufal.morphodita as ufal from . import LOG Token = namedtuple("Token", "word lemma tag") class Tagger: """A MorphoDiTa morphological tagger and lemmatizer. :param tagger_path: Path to the pre-compiled tagging models to load. :type tagger_path: str """ _NO_TOKENIZER = ( "No tokenizer defined for tagger {!r}! Please provide " "pre-tokenized and sentence-split input." ) _TEXT_REQS = ( "Please provide a string or an iterable of iterables (not " "strings!) of strings as the ``text`` parameter." ) def __init__(self, tagger_path): self._tagger_path = tagger_path LOG.info("Loading tagger.") self._tagger = ufal.Tagger.load(tagger_path) if self._tagger is None: raise RuntimeError("Unable to load tagger from {!r}!".format(tagger_path)) self._morpho = self._tagger.getMorpho() self._has_tokenizer = self._tagger.newTokenizer() is not None if not self._has_tokenizer: LOG.warning(self._NO_TOKENIZER.format(tagger_path)) @lazy def _pdt_to_conll2009_converter(self): return ufal.TagsetConverter_newPdtToConll2009Converter() @lazy def _strip_lemma_comment_converter(self): return ufal.TagsetConverter_newStripLemmaCommentConverter(self._morpho) @lazy def _strip_lemma_id_converter(self): return ufal.TagsetConverter_newStripLemmaIdConverter(self._morpho) @lru_cache(maxsize=16) def _get_converter(self, convert): try: converter = ( getattr(self, "_" + convert + "_converter") if convert is not None else None ) except AttributeError as err: converters = [ a[1:-10] for a in dir(self) if "converter" in a and a != "_get_converter" ] raise ValueError( "Unknown converter {!r}. Available converters: " "{!r}.".format(convert, converters) ) from err return converter def tag(self, text, *, sents=False, guesser=False, convert=None): """Perform morphological tagging and lemmatization on text. If ``text`` is a string, sentence-split, tokenize and tag that string. If it's an iterable of iterables (typically a list of lists), then take each nested iterable as a separate sentence and tag it, honoring the provided sentence boundaries and tokenization. :param text: Input text. :type text: either str (tokenization is left to the tagger) or iterable of iterables (of str), representing individual sentences :param sents: Whether to signal sentence boundaries by outputting a sequence of lists (sentences). :type sents: bool :param guesser: Whether to use the morphological guesser provided with the tagger (if available). :type guesser: bool :param convert: Conversion strategy to apply to lemmas and / or tags before outputting them. :type convert: str, one of "pdt_to_conll2009", "strip_lemma_comment" or "strip_lemma_id", or None if no conversion is required :return: An iterator over the tagged text, possibly grouped into sentences if ``sents=True``. >>> tagger = Tagger("./czech-morfflex-pdt-161115.tagger") >>> from pprint import pprint >>> tokens = list(tagger.tag("Je zima. Bude sněžit.")) >>> pprint(tokens) [Token(word='Je', lemma='být', tag='VB-S---3P-AA---'), Token(word='zima', lemma='zima-1', tag='NNFS1-----A----'), Token(word='.', lemma='.', tag='Z:-------------'), Token(word='Bude', lemma='být', tag='VB-S---3F-AA---'), Token(word='sněžit', lemma='sněžit_:T', tag='Vf--------A----'), Token(word='.', lemma='.', tag='Z:-------------')] >>> tokens = list(tagger.tag([['Je', 'zima', '.'], ['Bude', 'sněžit', '.']])) >>> pprint(tokens) [Token(word='Je', lemma='být', tag='VB-S---3P-AA---'), Token(word='zima', lemma='zima-1', tag='NNFS1-----A----'), Token(word='.', lemma='.', tag='Z:-------------'), Token(word='Bude', lemma='být', tag='VB-S---3F-AA---'), Token(word='sněžit', lemma='sněžit_:T', tag='Vf--------A----'), Token(word='.', lemma='.', tag='Z:-------------')] >>> sents = list(tagger.tag("Je zima. Bude sněžit.", sents=True)) >>> pprint(sents) [[Token(word='Je', lemma='být', tag='VB-S---3P-AA---'), Token(word='zima', lemma='zima-1', tag='NNFS1-----A----'), Token(word='.', lemma='.', tag='Z:-------------')], [Token(word='Bude', lemma='být', tag='VB-S---3F-AA---'), Token(word='sněžit', lemma='sněžit_:T', tag='Vf--------A----'), Token(word='.', lemma='.', tag='Z:-------------')]] """ if isinstance(text, str): yield from self.tag_untokenized(text, sents, guesser, convert) # The other accepted type of input is an iterable of iterables of # strings, but we only do a partial check whether the top-level object # is an Iterable, because it would have to be consumed in order to # inspect its first item. A second check which signals the frequent # mistake of passing an iterable of strings (which results in tagging # each character separately) occurs in ``Tagger.tag_tokenized()``. elif isinstance(text, Iterable): yield from self.tag_tokenized(text, sents, guesser, convert) else: raise TypeError(self._TEXT_REQS) def tag_untokenized(self, text, sents=False, guesser=False, convert=None): """This is the method :meth:`tag` delegates to when `text` is a string. See docstring for :meth:`tag` for details about parameters. """ if not self._has_tokenizer: raise RuntimeError(self._NO_TOKENIZER.format(self._tagger_path)) tokenizer = self._tagger.newTokenizer() tokenizer.setText(text) converter = self._get_converter(convert) forms = ufal.Forms() tagged_lemmas = ufal.TaggedLemmas() token_ranges = ufal.TokenRanges() yield from self._tag( tokenizer, sents, guesser, converter, forms, tagged_lemmas, token_ranges ) def tag_tokenized(self, text, sents=False, guesser=False, convert=None): """This is the method :meth:`tag` delegates to when `text` is an iterable of iterables of strings. See docstring for :meth:`tag` for details about parameters. """ vtokenizer = ufal.Tokenizer_newVerticalTokenizer() converter = self._get_converter(convert) forms = ufal.Forms() tagged_lemmas = ufal.TaggedLemmas() token_ranges = ufal.TokenRanges() for sent in text: # refuse to process if sent is a string (because that would result # in tagging each character separately, which is nonsensical), or # more generally, not an iterable, if isinstance(sent, str) or not isinstance(sent, Iterable): raise TypeError(self._TEXT_REQS) vtokenizer.setText("\n".join(sent)) yield from self._tag( vtokenizer, sents, guesser, converter, forms, tagged_lemmas, token_ranges, ) def _tag( self, tokenizer, sents, guesser, converter, forms, tagged_lemmas, token_ranges ): while tokenizer.nextSentence(forms, token_ranges): self._tagger.tag(forms, tagged_lemmas, guesser) if sents: sent = [] for tagged_lemma, word in zip(tagged_lemmas, forms): if converter is not None: converter.convert(tagged_lemma) token = Token(word, tagged_lemma.lemma, tagged_lemma.tag) if sents: sent.append(token) else: yield token if sents: yield sent PK!/| | corpy/morphodita/tokenizer.py"""An interface to MorphoDiTa tokenizers. """ import ufal.morphodita as ufal from . import LOG class Tokenizer: """A wrapper API around the tokenizers offered by MorphoDiTa. :param tokenizer_type: Type of the requested tokenizer (cf. below for possible values). :type tokenizer_type: str `tokenizer_type` is typically one of: - ``"czech"``: a tokenizer tuned for Czech - ``"english"``: a tokenizer tuned for English - ``"generic"``: a generic tokenizer - ``"vertical"``: a simple tokenizer for the vertical format, which is effectively already tokenized (one word per line) Specifically, the available tokenizers are determined by the ``new_*_tokenizer`` static methods on the MorphoDiTa ``tokenizer`` class described in the `MorphoDiTa API reference `__. """ def __init__(self, tokenizer_type): constructor_name = "new" + tokenizer_type.capitalize() + "Tokenizer" self.tokenizer_constructor = getattr(ufal.Tokenizer, constructor_name) @staticmethod def from_tagger(tagger_path): """Load tokenizer associated with tagger file. """ self = Tokenizer("generic") LOG.info("Loading tagger.") tagger = ufal.Tagger.load(tagger_path) self.tokenizer_constructor = tagger.newTokenizer if self.tokenizer_constructor() is None: raise RuntimeError(f"The tagger {tagger_path} has no associated tokenizer.") return self def tokenize(self, text, sents=False): """Tokenize `text`. :param text: Text to tokenize. :type text: str :param sents: Whether to signal sentence boundaries by outputting a sequence of lists (sentences). :type sents: bool :return: An iterator over the tokenized text, possibly grouped into sentences if ``sents=True``. Note that MorphoDiTa performs both sentence splitting and tokenization at the same time, but this method iterates over tokens without sentence boundaries by default: >>> from corpy.morphodita import Tokenizer >>> t = Tokenizer("generic") >>> for word in t.tokenize("foo bar baz"): ... print(word) ... foo bar baz If you want to iterate over sentences (lists of tokens), set ``sents=True``: >>> for sentence in t.tokenize("foo bar baz", sents=True): ... print(sentence) ... ['foo', 'bar', 'baz'] """ # this is more elegant than just segfaulting in the MorphoDiTa C++ library if None is # passed... if not isinstance(text, str): raise TypeError( "``text`` should be a str, you passed in {}.".format(type(text)) ) forms = ufal.Forms() token_ranges = ufal.TokenRanges() tokenizer = self.tokenizer_constructor() tokenizer.setText(text) while tokenizer.nextSentence(forms, token_ranges): if sents: yield list(forms) else: yield from forms PK!?|8|8corpy/phonetics/cs.py"""Perform rule-based phonetic transcription of Czech. Some frequent exceptions to the otherwise fairly regular orthography-to-phonetics mapping are overridden using a pronunciation lexicon. """ import unicodedata as ud from functools import lru_cache from operator import itemgetter from pathlib import Path from typing import Dict, Iterable, List, Optional, Set, Tuple, Union # noqa: F401 import regex as re # ------------------------------ Utils ------------------------------ def _filter_comments(lines): for line in lines: if not line.strip().startswith("#"): yield line def _load_phones(tsv: str) -> Dict[str, Dict[str, str]]: ans: Dict[str, Dict[str, str]] = {} lines = tsv.splitlines() header_, lines = lines[0], lines[1:] header = [h.lower() for h in header_.split("\t")] header.pop(0) for line_ in _filter_comments(lines): line = line_.split("\t") key = line.pop(0) val = ans.setdefault(key, {}) for alphabet_id, symbol in zip(header, line): val[alphabet_id] = symbol return ans def _load_substr2phones(tsv: str, allowed: Dict) -> Dict[str, List[str]]: ans: Dict[str, List[str]] = {} lines = tsv.splitlines() lines.pop(0) for line in _filter_comments(lines): substr, phones = line.split("\t") phones_for_substr = ans.setdefault(substr, []) for phone in phones.split(): assert phone in allowed, f"Unexpected phone {phone!r}" phones_for_substr.append(phone) return ans def _load_voicing_pairs( tsv: str, allowed: Dict ) -> Tuple[Dict[str, str], Dict[str, str], Set[str], Set[str]]: devoiced2voiced, voiced2devoiced = {}, {} lines = tsv.splitlines() lines.pop(0) for line in _filter_comments(lines): devoiced, voiced = line.split("\t") assert devoiced in allowed, f"Unexpected phone {devoiced!r}" assert voiced in allowed, f"Unexpected phone {voiced!r}" devoiced2voiced[devoiced] = voiced voiced2devoiced[voiced] = devoiced trigger_voicing = set(voiced2devoiced.keys()) trigger_voicing.remove("v") trigger_voicing.remove("P\\") trigger_devoicing = set(devoiced2voiced.keys()) trigger_devoicing.remove("Q\\") return devoiced2voiced, voiced2devoiced, trigger_voicing, trigger_devoicing def _create_substr_re(substr_list: Iterable[str]) -> re.Regex: substr_list = sorted(substr_list, key=len, reverse=True) + ["."] return re.compile("|".join(substr_list)) class _ExceptionRewriter: def __init__(self, tsv: str) -> None: lines = tsv.splitlines() lines.pop(0) rules = [] for line in _filter_comments(lines): match, rewrite = line.split("\t") match = f"(?P{match})" if "(" not in match else match orig = re.search(r"\(\?P(.*?)\)", match).group(1) rules.append((match, orig, rewrite)) # reverse sort by length of substring matched, so that longest match applies rules.sort(key=itemgetter(1), reverse=True) re_str = "(" + "|".join(match for (match, _, _) in rules) + ")" self._re = re.compile(re_str) self._orig2rewrite: Dict[str, str] = { orig: rewrite for (_, orig, rewrite) in rules } @lru_cache() def _sub(self, string: str) -> str: self._at = 0 return self._re.sub(self._rewrite, string) def _rewrite(self, match) -> str: # entire match matched = match.group() # multiple rewrites are allowed, but they must be contiguous and start # at the beginning of the string; otherwise, the match is returned # unchanged if match.start() == self._at: self._at += len(matched) else: return matched # the part of the match we want to replace orig = match.group("x") # what we want to replace it with rewrite = self._orig2rewrite[orig] return matched.replace(orig, rewrite) # ------------------------------ Load config ------------------------------ DIR = Path(__file__) PHONES = _load_phones( DIR.with_name("phones.tsv").read_text(encoding="utf-8") # pylint: disable=E1101 ) SUBSTR2PHONES = _load_substr2phones( DIR.with_name("substr2phones.tsv").read_text( encoding="utf-8" ), # pylint: disable=E1101 PHONES, ) DEVOICED2VOICED, VOICED2DEVOICED, TRIGGER_VOICING, TRIGGER_DEVOICING = _load_voicing_pairs( DIR.with_name("voicing_pairs.tsv").read_text( encoding="utf-8" ), # pylint: disable=E1101 PHONES, ) SUBSTR_RE = _create_substr_re(SUBSTR2PHONES.keys()) REWRITER = _ExceptionRewriter( DIR.with_name("exceptions.tsv").read_text(encoding="utf-8") # pylint: disable=E1101 ) # ------------------------------ Public API ------------------------------ class Phone: """A single phone. You probably don't need to create these by hand, but they will be returned to you from :func:`transcribe`. """ def __init__(self, value: str, *, word_boundary: bool = False) -> None: self.value: str = value self.word_boundary = word_boundary def __repr__(self): return f"/{self.value}/" EMPTY_PHONE = Phone("") class ProsodicUnit: """A prosodic unit which should be transcribed as a whole. This means that various connected speech processes are emulated at word boundaries within the unit as well as within words. :param orthographic: The orthographic transcript of the prosodic unit. :type orthographic: list of str """ def __init__(self, orthographic: List[str]) -> None: self.orthographic = orthographic self._phonetic: Optional[List[Phone]] = None def phonetic( self, *, alphabet: str = "SAMPA", hiatus=False ) -> List[Tuple[str, ...]]: """Phonetic transcription of ProsodicUnit.""" if self._phonetic is None: trans = self._str2phones(self.orthographic) # CSPs are implemented in one reverse pass (assimilation of voicing # can propagate) and one forward pass trans = self._voicing_assim(trans) trans = self._other_csps(trans, hiatus=hiatus) self._phonetic = trans return self._split_words_and_translate(self._phonetic, alphabet) @staticmethod def _str2phones(input_: List[str]) -> List[Phone]: """Convert string to phones. Use pronunciation from dictionary if available, fall back to generic rewriting rules. """ output: List[Phone] = [] for word in input_: word = word.lower() # rewrite exceptions word = REWRITER._sub(word) # force hiatus in sequences; is there because the # exceptions above can insert it in place of to prevent # palatalization word = re.sub(r"([iy])([ií])", r"\1j\2", word) # remove duplicate graphemes (except for short vowels, cf. ) # cf. no gemination below for the phonetic counterpart of this rule word = re.sub(r"([^aeoiuy])\1", r"\1", word) for match in SUBSTR_RE.finditer(word.lower()): substr = match.group() try: phones = SUBSTR2PHONES[substr] except KeyError as err: raise ValueError( f"Unexpected substring in input: {substr!r}" ) from err output.extend(Phone(ph) for ph in phones) output[-1].word_boundary = True return output @staticmethod def _voicing_assim(input_: List[Phone]) -> List[Phone]: r"""Perform assimilation of voicing. Usually regressive, but P\ assimilates progressively as well. """ output = [] previous_phone = EMPTY_PHONE for phone in reversed(input_): if previous_phone.value in TRIGGER_VOICING: phone.value = DEVOICED2VOICED.get(phone.value, phone.value) elif phone.word_boundary or previous_phone.value in TRIGGER_DEVOICING: phone.value = VOICED2DEVOICED.get(phone.value, phone.value) # for P\, the assimilation works the other way round too elif previous_phone.value == "P\\" and phone.value in TRIGGER_DEVOICING: previous_phone.value = "Q\\" output.append(phone) previous_phone = phone output.reverse() return output @staticmethod def _other_csps(input_: List[Phone], *, hiatus=False) -> List[Phone]: """Perform other connected speech processes.""" output = [] for i, phone in enumerate(input_): try: next_ph = input_[i + 1] except IndexError: next_ph = EMPTY_PHONE # assimilation of place for nasals if phone.value == "n" and next_ph.value in ("k", "g"): phone.value = "N" elif phone.value == "m" and next_ph.value in ("f", "v"): phone.value = "F" # no gemination (except across word boundaries and for short # vowels); cf. remove duplicate graphemes above for the # orthographic counterpart of this rule elif ( phone.value == next_ph.value and phone.value not in "aEIou" and not phone.word_boundary ): continue # drop CSP-blocking pseudophones (they've done their job by now) elif phone.value == "-": continue output.append(phone) # optionally add transient /j/ between high front vowel and subsequent vowel if ( hiatus and re.match("[Ii]", phone.value) and re.match("[aEIoui]", next_ph.value) ): output.append(Phone("j")) return output @staticmethod def _split_words_and_translate( input_: List[Phone], alphabet ) -> List[Tuple[str, ...]]: output = [] word = [] alphabet = alphabet.lower() for phone in input_: word.append(PHONES.get(phone.value, {}).get(alphabet, phone.value)) if phone.word_boundary: output.append(tuple(word)) word = [] return output def _separate_tokens( tokens: List[str], prosodic_boundary_symbols: Set[str] ) -> Tuple[List[Optional[str]], List[str]]: """Separate tokens for transcription from those that will be left as is. Returns two lists: the first one is a matrix for the result containing non-alphabetic tokens and gaps for the alphabetic ones, the second one contains just the alphabetic ones. """ matrix: List[Optional[str]] = [] to_transcribe = [] for token in tokens: if re.fullmatch(r"[\p{Alphabetic}\-]*\p{Alphabetic}[\p{Alphabetic}\-]*", token): # instead of simply checking for a final hyphen in the outer # condition and silently shoving an otherwise transcribable token # into matrix, it's better to fail and alert the user they probably # meant something else if token.endswith("-"): raise ValueError( f"Can't transcribe token ending with hyphen ({token!r}), place hyphen at " "beginning of next token instead" ) to_transcribe.append(token) matrix.append(None) elif token in prosodic_boundary_symbols: to_transcribe.append("-") matrix.append(token) else: matrix.append(token) return matrix, to_transcribe def transcribe( phrase: Union[str, Iterable[str]], *, alphabet="sampa", hiatus=False, prosodic_boundary_symbols=set(), ) -> List[Union[str, Tuple[str, ...]]]: """Phonetically transcribe `phrase`. `phrase` is either a string (in which case it is split on whitespace) or an iterable of strings (in which case it's considered as already tokenized by the user). Transcription is attempted for tokens which consist purely of alphabetical characters and possibly hyphens (``-``). Other tokens are passed through unchanged. Hyphens have a special role: they prevent interactions between graphemes or phones from taking place, which means you can e.g. cancel assimilation of voicing in a cluster like ``tb`` by inserting a hyphen between the graphemes: ``t-b``. They are removed from the final output. If you want a **literal hyphen**, it must be inside a token with either no alphabetic characters, or at least one other non-alphabetic character (e.g. ``-``, ``---``, ``-hlad?``, etc.). Returns a list where **transcribed tokens** are represented as **tuples of strings** (phones) and **non-transcribed tokens** (which were just passed through as-is) as plain **strings**. `alphabet` is one of SAMPA, IPA, CS or CNC (case insensitive) and determines the symbol alphabet used in the phonetic transcript. When ``hiatus=True``, a /j/ phone is added between a high front vowel and a subsequent vowel. Various connected speech processes such as assimilation of voicing are emulated even across word boundaries. By default, this happens **irrespective of intervening non-transcribed tokens**. If you want some types of non-transcribed tokens to constitute an obstacle to interactions between phones, pass them as a set via the `prosodic_boundary_symbols` argument. E.g. ``prosodic_boundary_symbols={"?", ".."}`` will prevent CSPs from being emulated across ``?`` and ``..`` tokens. """ try: if isinstance(phrase, str): tokens = ud.normalize("NFC", phrase.strip()).split() else: tokens = [ud.normalize("NFC", t) for t in phrase] except TypeError as err: raise TypeError( f"Expected str or Iterable[str] as phrase argument, got {type(phrase)} instead" ) from err matrix, to_transcribe = _separate_tokens(tokens, prosodic_boundary_symbols) transcribed = ProsodicUnit(to_transcribe).phonetic(alphabet=alphabet, hiatus=hiatus) return [m if m is not None else transcribed.pop(0) for m in matrix] # type: ignore PK!nL''corpy/phonetics/exceptions.tsvMATCH REWRITE # HOWTO: if only a substring of the MATCH regex should be rewritten, # wrap it in a capturing group named x: (?P...); also useful if # you need to add anchors and other special characters around the # substring that should be rewritten akusti akusty augustin augustyn besti besty charakteristi charakterysty (?Pcyklisti). cyklisty destin destyn drasti drasty dynasti dynasty (?Pfantasti). fantasty festival festyval instinkt instynkt instit instyt investi investy justi justy logisti logisty mysti mysty (?Pnacionalisti). nacionalisty (?Pnacisti). nacisty prestiž prestyž palestin palestyn plasti plasty prostitu prostytu (?Prealisti). realisty (?Psarkasti). sarkasty sebastian sebastyan (?Pslávisti). slávisty (?Psocialisti). socialisty (?Pstatisti). statysty stimul stymul sugesti sugesty (?Pteroristi). teroristy textil textyl (?Pturisti). turisty vestibul vestybul alternativ alternatyv aromati aromaty (?Pautomati). automaty autoritativ autoritatyv battist battyst charitativ charitatyv (?Pdemokrati). demokraty dramati dramaty gramati gramaty informati informaty inovativ inovatyv inspirativ inspiratyv klimati klimaty kompatibil kompatybil konzervativ konzervatyv kreativ kreatyv kvalitativ kvalitatyv legislativ legislatyv lukrativ lukratyv matemati matematy matik matyk narativ naratyv negativ negatyv normativ normatyv operativ operatyv pneumatik pneumatyk privatiz privatyz problemati problematy provokativ provokatyv relativ relatyv reprezentativ reprezentatyv (?Pstati)[kcč] staty sympati sympaty systemati systematy temati tematy témati tématy vatikán vatykán dillí dylí dia dya die dye diferenc dyferenc digi dygi dikt dykt dim dym dip dyp diri dyri disci dysci disk dysk disp dysp distr dystr dividend dyvidend diviz dyviz diář dyář aktiv aktyv arkti arkty antarkti antarkty atrakti atrakty destrukti destrukty detekti detekty efektiv efektyv fakti fakty fiktiv fiktyv interaktiv interaktyv kolektiv kolektyv konstruktiv konstruktyv objektiv objektyv perspektiv perspektyv prakti prakty produktiv produktyv respektiv respektyv subjektiv subjektyv takti takty analyti analyty graffiti grafity kriti krity legiti legity pervitin pervityn politi polity pozitiv pozityv primitiv primityv (?Patleti). atlety elektromagneti elektromagnety energeti energety esteti estety eti ety geneti genety kosmeti kosmety (?Pmagneti). magnety marketing marketyng (?Ppeti). pety (?Ppoeti). poety synteti syntety (?Pteoreti). teorety daniel danyel anim anym anit anyt botani botany humani humany manifest manyfest manipul manypul mechani mechany organi organy panik panyk # TODO: if variants are implemented, allow homophone (virgin) panice panyce reorgani reorgany sanit sanyt zorgani zorgany monik monyk veronik veronyk antonio antonyo architektoni architektony ceremoni ceremony chroni chrony (?Pelektroni). elektrony filharmoni filharmony harmoni harmony ironi irony (?Pkoloni). kolony kroni krony mattoni matony monitor monytor symfoni symfony (?Ptelefoni). telefony anti anty argenti argenty (?Pcenti). centy entit entyt (?Pgiganti). giganty identi identy intim intym kontin kontyn mantinel mantynel preventiv preventyv romanti romanty valentine valentajn ventil ventyl imunit imunyt junio junyo komuni komuny (?Pkomunisti). komunysty muni muny unie unye unij unyj unii unyi unií unyí unifor unyfor unikát unykát unikum unykum unipetrol unypetrol univerzit unyverzit univerzál unyverzál (?Pindi). indy kandid kandyd kondi kondy skandin skandyn dominik dominyk aerolini aeroliny defini definy exmini exminy klini kliny lini liny mini miny poliklini polikliny edi edy benedik benedyk encyklopedi encyklopedy expedi expedy ingredi ingredy komedi komedy kredit kredyt medic medyc medik medyk (?Pmedit). medyt mediá medyá profimedia profimédya využ vy-už vyuč vy-uč zadostiuči zadosti-uči tradi trady radik radyk radio rádyo (?Prádi). rády radiá radyá sporadi sporady stadio stadyo stadió stadyó denis denys dennis denys geniál genyál hygieni hygieny penis penys provenien provenyen senio senyo suvereni suvereny tenis tenys emotiv emotyv (?Peroti). eroty flotil flotyl goti goty lokomotiv lokomotyv motiv motyv buddhism budhizm charism charizm fašism fašizm kapitalism kapitalizm katolicism katolicizm liberalism liberalizm metabolism metabolizm nacionalism nacionalizm nacism nacizm rasism rasizm realism realizm socialism socializm terorism terorizm techni techny # TODO: possibly allow for zh- if variants are added in the future...? shod schod sho scho shora zhora shá schá shrn schrn shro schro shled schled tibet tybet tip typ # NOTE: this no-op rule is a way to bypass the rewrite rule above, # which is triggered by a shorter match tipec tipec titul tytul deskriptiv deskriptyv opti opty skepti skepty subtil subtyl exi egzi ordi ordy ferdinand ferdynand kardi kardy koordi koordy verdi verdy pudin pudyn studi study certifi certyfi partici partyci partie partye partii partyi partií partyí partiích partyích sortiment sortyment vertik vertyk dc c komodit komodyt melodi melody metodi metody modifi modyfi parodi parody detail detajl medail medajl trailer trajler fénix fényx géni gény trénin trényn viet vjet (?Pmédi). médy (?Ptragédi). tragédy email ímejl kaliforni kaliforny moderni moderny verni verny nissan nysan nil nyl británi britány albáni albány administrati adminystraty iniciati inyciaty vicky viky patrick patrik frederick frederik rick rik mick mik rock rok exe egze audi audy claudi klaudy (?Podd). od-d (?Podt). ot-t třia tři-a čtyřia čtyři-a štyřia štyři-a štyrya štyry-a th t výu vý-u vyu vy-u vyo vy-o john džon ^(?Psoftware)$ softvér softwar softvér ^(?Phardware) hárdvér hardwar hárdvér filosof filozof (?Pdiplomati). dyplomaty carlos karlos café kafé carmen karmen canon kanon srdc src tesc tesk oscar oskar idio idyo prezidi prezídy prezídi prezídy jidiš jidyš scott skot server servr mozaik mozajk josef jozef klause klauze klausovi klauzovi krause krauze krausovi krauzovi leasing lízing stadium stádyum stadiu stádyu stadia stádya (?Pstádi). stády přesvědč přesvěč svědč svěč asij azij fantasy fantazy etni etny telecom telekom econom ekonom jack džek jazz džez optimisti optymisty protiú proti-ú vyú vy-ú lai la-i mail mejl antibiotik antybiotyk komunism komunyzm chinaski činaski chilli čili stipendi stypendy definiti definyty party párty technoparty technopárty inje iňe pódi pódy slavii slávii slavie slávie # TODO: what about syllabic consonants? → write some rules to label them # optionally washington vošinktn charles čárls potter potr volkswagen folksvágn fiction fikšn design dyzajn asie ázie asii ázii afghánistán afgánystán vitamin vitamín minus mínus celsia celzia impulsy impulzy fischer fišer ophél ofél summit samit toyot tojot optimism optymizm resort rezort iont jont mítink mítynk exo egzo (?Pexoti). egzoty thriller triler přese přeze kreseb krezeb kasin kasín wales vejls hollywood holyvúd phil fil kokain kokajn intuiti intuity indonés indonéz piano piáno orgasm orgazm podtitul podtytul laser lejzr tchajwan tajvan combi kombi copy kopy protein protejn facto fakto pizz pic whisk visk shop šop puzzle pucle diet dyet tequi teki gay gej lady lejdy group grúp bosch boš bush buš brown braun sezona sezóna playoff plejof czech ček grace grejs business biznys pojďme pojťme googl gúgl ^(?Pgoogle)$ gúgl university junyversity country kántry schulz šulc alexander alexandr alois alojz blues blús time tajm black blek instinktiv instynktyv office ofis media médya charlott šarlot charlotte šarlot aaron áron einstein ajnštajn interview intervjú jamese džejmse jimmy džimi holding holdyng # TODO: maybe smuggle a /ɣ/ in there? těchh těh neumann nojman dick dyk churchill čerčil money many boom búm jerry džery green grín beatles bítls peugeot pežot holocaust holokaust credit kredyt tsunami cunami brandy brendy kvantitati kvantytaty schmeling šmeling nicole nykol nicol nykol ^(?Pnikol)$ nykol nikolaj nykolaj nikola nykola (?Pnikol)[kcč] nykol octav oktáv franka frenka czechtrade čektrejd czechtradu čektrejdu debut debit stockholm stokholm people pípl roy roj multimedi multymedy opel opl team tým revue reví rath rát christopher kristofr fair fér berger bergr ^(?Pout)$ aut hadamczik hadamčik fabi fábi flora flóra portfol portfól arthur artur telefónic telefónyk arlene árlín life lajf iq íkvé juli júli franz franc charlie čárlí schumacher šumachr karoser karosér abraham ejbrehem free frí justin džastyn uveďme uveťme biodiverz biodyverz horváth horvát makeup mejkap british brityš rutin rutyn schwarz švarc handicap hendykep ghett get radioaktiv radyoaktyv victor viktor představme předstafme (?Pfarmaceuti). farmaceuty dublin dablin konkurs konkurz cash keš electric elektrik zombie zombí diabetik dyabetyk classic klasik kognitiv kognytyv bismarck bizmark ruin rujn gang geng malajs malajz panasonic panasonyk haag hág ralph ralf seifert sajfrt annette anet museum mjúzíum khaki kaki sherry šeri konsorci konzorci marcus markus sherlock šerlok buďme buťme malcolm malkolm alfred alfréd hacke hek focus fokus fritz fric playboy plejboj piknik piknyk poesi poezi olympic olympik reggae rege menzel mencl (?Pdidakti). dydakty chevrolet ševrolet dalajlam dalajlám ^(?Pfranco)$ franko isaac ajzek tbili t-bili poker pokr genius džínyjus ulti ulty picass pikas diesel dýzl coca koka cola kola cocacola kokakola akreditiv akredytyv isabel izabel innsbruck inzbruk semitism semityzm marco marko použ po-už zentiv zentyv mechanism mechanyzm organism organyzm atlanti atlanty autenti autenty benjam beňam benj bendž dekorativ dekoratyv dinosaur dynosaur hegel hégl jseš seš jsi si jsme sme ^(?Pjsou)$ sou jste ste (?Pkyberneti). kybernety sentiment sentyment telekomuni telekomuny # NOTE: you can also add prefixes that don't contain any rewrites # but that are allowed to occur in front of rewritten substrings ne ne mikro mikro nej nej PK!corpy/phonetics/phones.tsvSAMPA IPA CS CNC # - is just a technical symbol used in exceptions.tsv to prevent # CSPs like voicing assimilation, hiatus insertion, merging of # two vowels into a diphthong etc. - - - - i: iː í í I ɪ i i E ɛ e e E: ɛː é é a a a a a: aː á á o o o o o: oː ó ó u u u u u: uː ú ú o_u o͡u ou̯ ou a_u a͡u au̯ au E_u ɛ͡u eu̯ eu p p p p b b b b t t t t d d d d c c ť ť J\ ɟ ď ď k k k k g ɡ g g f f f f v v v v s s s s z z z z S ʃ š š Z ʒ ž ž x x ch ch h\ ɦ h h t_s ʦ c c t_S ʧ č č m m m m n n n n J ɲ ň ň r r r r l l l l j j j j P\ r̝ ř ř G ɣ ɣ ɣ d_z ʣ dz ʒ d_Z ʤ dž ʒ̆ Q\ r̝̊ ř̭ ř N ŋ ŋ ŋ F ɱ ɱ ɱ r= r̩ r̥ r l= l̩ l̥ l m= m̩ m̥ m ? ʔ ʔ @ ə ə ə PK!.zee!corpy/phonetics/substr2phones.tsvSUBSTR PHONE ch x au a_u eu E_u ou o_u dě J\ E ně J E mě m J E tě c E di J\ I ni J I ti c I dí J\ i: ní J i: tí c i: dz d_z dž d_Z qu k v a a b b c t_s d d e E f f g g h h\ i I j j k k l l m m n n o o p p q k v r r s s t t u u v v w v x k s y I z z á a: ä E ą a m â a é E: ě j E ë E í i: ó o: ö E: ő E: ô o ú u: ů u: ü I ű I ý i: č t_S ć t_S ď J\ đ d ł l ĺ l ľ l ň J ń J ř P\ ŕ r š S ś S ş S ť c ž Z ż Z # - is just a technical symbol used in exceptions.tsv to prevent # CSPs like voicing assimilation, hiatus insertion, merging of # two vowels into a diphthong etc. - - PK!LL!corpy/phonetics/voicing_pairs.tsvDEVOICED VOICED p b t d c J\ k g s z S Z t_s d_z t_S d_Z x G x h\ f v Q\ P\ PK!g4 corpy/scripts/xc.pyimport os.path as osp import click as cli import logging as log import unicodedata as ud from collections import Counter import regex as re from lxml import etree NAME = osp.splitext(osp.basename(__file__))[0] LOG = log.getLogger(NAME) LOGLEVELS = [ s for f, s in sorted( (v, k) for k, v in vars(log).items() if k.isupper() and isinstance(v, int) ) ] NORM_FORMS = ("NFC", "NFD", "NFKC", "NFKD") def count_extended_grapheme_clusters(text): return Counter(m.group() for m in re.finditer(r"\X", text)) def check_normalization(fdist, expected_form="NFC"): LOG.info("Checking normalization of identified extended grapheme clusters.") for extended_grapheme_cluster in fdist.keys(): normalized = ud.normalize(expected_form, extended_grapheme_cluster) if extended_grapheme_cluster != normalized: LOG.warning( f"Expected {normalized!r} according to {expected_form}, got " f"{extended_grapheme_cluster!r} instead!" ) def parse(file, xml=False): if xml: LOG.info(f"Parsing {file.name!r} as XML.") tree = etree.parse(file) for elem in tree.iter(): yield from elem.attrib.values() yield elem.text yield elem.tail else: yield from file def print_fdist(fdist): for extended_grapheme_cluster, count in fdist.most_common(): names, codepoints = [], [] for codepoint in extended_grapheme_cluster: name = ud.name(codepoint, None) # control characters have no names, and for them, we want to print their repr instead codepoints.append(repr(codepoint) if name is None else codepoint) names.append("__NO_NAME__" if name is None else name) print(count, "".join(codepoints), "+".join(names), sep="\t") @cli.command() @cli.option( "--expected-normalization", help="Warn if identified extended grapheme clusters do not " "match expected normalization form.", type=cli.Choice(NORM_FORMS), ) @cli.option("--lower", help="Convert to lowercase before processing.", is_flag=True) @cli.option( "--xml", help="Parse input as XML and process only text nodes and attribute values.", is_flag=True, ) @cli.option( "lvl", "--log", help="Set logging level.", type=cli.Choice(LOGLEVELS), default="WARN", ) @cli.option("--verbose", "-v", help="(Repeatedly) increase logging level.", count=True) @cli.option("--quiet", "-q", help="(Repeatedly) decrease logging level.", count=True) @cli.argument("files", type=cli.File("rt", encoding="utf-8"), nargs=-1) def main(expected_normalization, lower, xml, lvl, verbose, quiet, files): """`wc -c` on steroids. Count extended grapheme clusters, print their frequency distribution. FILES are the files to process. Leave empty or - for STDIN. """ lvl = getattr(log, lvl) - 10 * verbose + 10 * quiet log.basicConfig( level=lvl, format="[%(asctime)s {}:%(levelname)s] %(message)s".format(NAME) ) files = files if files else (cli.File("rt", encoding="utf-8")("-"),) fdist = Counter() LOG.info("Aggregating counts of extended grapheme clusters in input.") for file in files: for fragment in parse(file, xml): if fragment is not None: fragment = fragment.lower() if lower else fragment fdist.update(count_extended_grapheme_clusters(fragment)) if expected_normalization: check_normalization(fdist, expected_normalization) print_fdist(fdist) PK!ڒcorpy/scripts/zip_verticals.pyimport os.path as osp import click as cli import logging as log NAME = osp.splitext(osp.basename(__file__))[0] LOG = log.getLogger(NAME) LOGLEVELS = [ s for f, s in sorted( (v, k) for k, v in vars(log).items() if k.isupper() and isinstance(v, int) ) ] def print_position(lines, line_no): lines = [l.strip(" \n").split("\t") for l in lines] word = lines[0][0] position = [word] for i, line in enumerate(lines): assert line[0] == word, ( f"Expected first attribute {word} but got {line[0]} in vertical " f"#{i+1} at line #{line_no+1}. Are you sure the verticals " "represent the same corpus?" ) position.extend(line[1:]) print("\t".join(position)) @cli.command() @cli.option( "lvl", "--log", help="Set logging level.", type=cli.Choice(LOGLEVELS), default="WARN", ) @cli.option("--verbose", "-v", help="(Repeatedly) increase logging level.", count=True) @cli.option("--quiet", "-q", help="(Repeatedly) decrease logging level.", count=True) @cli.argument("files", type=cli.File("rt", encoding="utf-8"), nargs=-1) def main(lvl, verbose, quiet, files): """Zip verticals together. Intended for "zipping" together verticals of the same corpus. At least one of them must have multiple positional attributes. Structures and the first positional attribute (which is included only once) are taken from the first vertical provided. FILES are the files to process. Leave empty or - for STDIN. """ lvl = getattr(log, lvl) - 10 * verbose + 10 * quiet log.basicConfig( level=lvl, format="[%(asctime)s {}:%(levelname)s] %(message)s".format(NAME) ) files = files if files else (cli.File("rt", encoding="utf-8")("-"),) LOG.info(f"Zipping the following vertical files: {files}") for line_no, lines in enumerate(zip(*files)): if any("\t" in l for l in lines): print_position(lines, line_no) else: print(lines[0].strip(" \n")) LOG.info("Done.") PK!:corpy/util/__init__.py"""Small utility functions. """ from pprint import pprint def _head_gen(items, first_n): for idx, item in enumerate(items): if first_n == idx: break yield item def head(collection, first_n=None): """Inspect `collection`, truncated if too long. If ``first_n=None``, an appropriate value is determined based on the type of the collection. """ type_ = type(collection) if first_n is None: if type_ in (str, bytes): first_n = 100 else: first_n = 10 if len(collection) <= first_n: pprint(collection) return if type_ == str: constructor = "".join elif type_ == bytes: constructor = b"".join else: constructor = type_ items = collection.items() if hasattr(collection, "items") else collection pprint(constructor(_head_gen(items, first_n))) def cmp(lhs, rhs, test="__eq__"): """Wrap assert statement to automatically raise an informative error.""" msg = f"{head(lhs)} {test} {head(rhs)} is not True!" ans = getattr(lhs, test)(rhs) # operators automatically fall back to identity comparison if the # comparison is not implemented for the given types, magic methods don't → # if comparison is not implemented, we must fall back to identity # comparison manually, because NotImplemented is truthy and makes the # assert succeed if ans is NotImplemented: ans = lhs is rhs assert ans, msg PK!~j$$corpy/vertical/__init__.py"""Parse and query corpora in the vertical format. """ # TODO: Put positions in a buffer (queue). Yield the middle position and give a handle on the # context to match and count functions. Gotchas: sattrs will have to be reimplemented if they're to # be available on the context; corpora shorter than the queue size; start and end corner cases # (before the queue fills up / as it's emptying out). import sys import gzip import os.path as osp from typing import List import re import datetime as dt from collections import namedtuple, defaultdict import numpy as np __all__ = ["Vertical", "Syn2015Vertical", "ipm", "arf"] Structure = namedtuple("Structure", "name attrs") UtklTag = namedtuple( "UtklTag", "pos sub gen num case pgen pnum pers tense grad neg act p13 p14 var asp" ) class Vertical: """Base class for a corpus in the vertical format. Create subclasses for specific corpora by at least specifying a list of :attr:`struct_names` and :attr:`posattrs` as class attributes. :param path: Path to the vertical file to work with. :type path: str """ struct_names: List[str] = [] """A list of expected structural attribute tag names.""" posattrs: List[str] = [] """A list of expected positional attributes.""" def __init__(self, path): if not (self.struct_names and self.posattrs): raise Exception( f"The class attributes `struct_names` and `posattrs` must be specified. You " f"probably want to subclass {self.__class__.__name__!r}." ) if not osp.isfile(path): raise Exception(f"File {path!r} does not exist!") self.path = path self._struct_re = re.compile( r"<\s*?(/?)\s*?({})(?:\s*?(/?)\s*?| (.*?))>".format( "|".join(self.struct_names) ) ) self.position_template = namedtuple("Position", self.posattrs) # if an integer > 0, then modulo for reporting progress; if falsey, then turns off reporting self.report = None def open(self): """Open the vertical file in :attr:`self.path`. Override this method in subclasses to specify alternative ways of opening, e.g. using :func:`gzip.open`. """ return open(self.path, "rt") def parse_position(self, position): """Parse a single position from the vertical. Override this method in subclasses to hook into the position parsing process. """ return self.position_template(*position.split("\t")) def positions(self, parse_sattrs=True, ignore_fn=None, hook_fn=None): """Iterate over the positions in the vertical. At any point during the iteration, the structural attributes corresponding to the current position are accessible via :attr:`self.sattrs`. :param parse_sattrs: Whether to parse structural attrs into a dict (default) or just leave the original string (faster). :type parse_sattrs: bool :param ignore_fn: If given, then evaluated at each position; if it returns ``True``, then the position is completely ignored. :type ignore_fn: function(posattrs, sattrs) :param hook_fn: If given, then evaluated at each position. :type hook_fn: function(posattrs, sattrs) """ self.sattrs = {} start = dt.datetime.now() with self.open() as file: for i, line in enumerate(file): line = line.strip(" \n\r") m = self._struct_re.fullmatch(line) if m: close, tag, self_close, attrs = m.groups() if close: self.sattrs.pop(tag) elif self_close: pass else: # TODO: figure out a way to allow nested tags...? if tag in self.sattrs: raise Exception( f"{tag!r} already in `sattrs`; nested tags?" ) if parse_sattrs: attrs = { m.group(1): m.group(2) for m in re.finditer( r'\s*?(\S+?)="([^"]*?)"', "" if attrs is None else attrs, ) } self.sattrs[tag] = attrs else: position = self.parse_position(line) if hook_fn: hook_fn(position, self.sattrs) if not (ignore_fn and ignore_fn(position, self.sattrs)): yield position if self.report and i % self.report == 0: time = dt.datetime.now() - start print(f"Processed {i:,} lines in {time}.", file=sys.stderr) def search(self, match_fn, count_fn=None, **kwargs): """Search the vertical, creating an index of what's been found. :param match_fn: Evaluated at each position to see if the position matches the given search. :type match_fn: function(match_fn, count_fn) :param count_fn: Evaluated at each **matching** position to determine what should be counted at that position (in the sense of being tallied as part of the resulting frequency distribution). If it returns a list, it's understood as a list of things to count. :param kwargs: Passed on to :meth:`~Vertical.positions`. :return: The frequency index of counted "things" and the size of the corpus. :rtype: (dict, int) """ if count_fn is None: count_fn = match_fn index = defaultdict(list) i = None # if loop below happens to have 0 iterations, i would be undefined... for i, position in enumerate(self.positions(**kwargs)): if match_fn(position, self.sattrs): count = count_fn(position, self.sattrs) if isinstance(count, list): for countable in count: index[countable].append(i) else: index[count].append(i) index = {k: np.array(v) for k, v in index.items()} return index, i if i is None else i + 1 # ... and we need to return it here class Syn2015Vertical(Vertical): """A subclass of :class:`Vertical` for the SYN2015 corpus. Refer to :class:`Vertical` for API details. """ struct_names = ["doc", "text", "p", "s", "hi", "lb"] posattrs = [ "word", "lemma", "tag", "proc", "afun", "parent", "eparent", "prep", "p_lemma", "p_tag", "p_afun", "ep_lemma", "ep_tag", "ep_afun", ] def open(self): return gzip.open(self.path, "rt") def parse_position(self, position): position = position.split("\t") position[2] = UtklTag(*position[2]) return self.position_template(*position) class ShuffledSyn2015Vertical(Syn2015Vertical): """A subclass of :class:`Vertical` for the SYN2015 corpus, shuffled. Refer to :class:`Vertical` for API details. """ struct_names = ["block"] + Syn2015Vertical.struct_names def ipm(occurrences, N): """Relative frequency of `occurrences` in corpus, in instances per million. """ return 1e6 * len(occurrences) / N def arf(occurrences, N): """Average reduced frequency of `occurrences` in corpus. """ freq = len(occurrences) if freq == 0: return 0 shifted = np.roll(occurrences, 1) distances = (occurrences - shifted) % N avg_dist = N / freq return sum(min(d, avg_dist) for d in distances) / avg_dist PK!UЙcorpy/vis/__init__.py"""Convenience wrappers for visualizing linguistic data. """ from collections import Counter from collections.abc import Mapping, Iterable import numpy as np from wordcloud import WordCloud CM_PER_IN = 2.54 def size_in_pixels(width, height, unit="in", ppi=300): """Convert size in inches/cm to pixels. :param width: width, measured in `unit` :param height: height, measured in `unit` :param unit: ``"in"`` for inches, ``"cm"`` for centimeters :param ppi: pixels per inch :return: ``(width, height)`` in pixels :rtype: (int, int) Sample values for ppi: - for displays: you can detect your monitor's DPI using the following website: ; a typical value is 96 (of course, double that for HiDPI) - for print output: 300 at least, 600 is high quality """ allowed_units = ("in", "cm") if unit not in allowed_units: raise ValueError(f"`unit` must be one of {allowed_units}.") if unit == "cm": width = width * CM_PER_IN height = height * CM_PER_IN return round(width * ppi), round(height * ppi) def _optimize_dimensions(size, fast, fast_limit): width, height = size # NOTE: Reasonable numbers for width and height are in the hundreds # to low thousands of pixels. If the requested size is large, for # faster results, we shrink the canvas during wordcloud # computation, and only scale it back up during rendering. if fast and width * height > fast_limit ** 2: scale = max(size) / fast_limit width = round(width / scale) height = round(height / scale) else: scale = 1 return width, height, scale def _elliptical_mask(width, height): x_center = half_width = round(width / 2) y_center = half_height = round(height / 2) x = np.arange(0, width) y = np.arange(0, height)[:, None] mat = ((x - x_center) / half_width) ** 2 + ((y - y_center) / half_height) ** 2 return (mat >= 1) * 255 def wordcloud( data, size=(400, 400), *, rounded=False, fast=True, fast_limit=800, **kwargs ): """Generate a wordcloud. If `data` is a string, the wordcloud is generated using the method :meth:`WordCloud.generate_from_text`, which automatically ignores stopwords (customizable with the `stopwords` argument) and includes "collocations" (i.e. bigrams). If `data` is a sequence or a mapping, the wordcloud is generated using the method :meth:`WordCloud.generate_from_frequencies` and these preprocessing responsibilities fall to the user. :param data: input data -- either one long string of text, or an iterable of tokens, or a mapping of word types to their frequencies; use the second or third option if you want full control over the output :param size: size in pixels, as a tuple of integers, (width, height); if you want to specify the size in inches or cm, use the :func:`size_in_pixels` function to generate this tuple :param rounded: whether or not to enclose the wordcloud in an ellipse; incompatible with the `mask` keyword argument :param fast: when ``True``, optimizes large wordclouds for speed of generation rather than precision of word placement :param fast_limit: speed optimizations for "large" wordclouds are applied when the requested canvas size is larger than ``fast_limit**2`` :param kwargs: remaining keyword arguments are passed on to the :class:`wordcloud.WordCloud` initializer :return: The word cloud. :rtype: :class:`wordcloud.WordCloud` """ if rounded and kwargs.get("mask"): raise ValueError("Can't specify `rounded` and `mask` at the same time.") # tweak defaults kwargs.setdefault("background_color", "white") # if Jupyter gets better at rendering transparent images, then # maybe these would be better defaults: # kwargs.setdefault("mode", "RGBA") # kwargs.setdefault("background_color", None) width, height, scale = _optimize_dimensions(size, fast, fast_limit) if rounded: kwargs["mask"] = _elliptical_mask(width, height) wcloud = WordCloud(width=width, height=height, scale=scale, **kwargs) # raw text if isinstance(data, str): return wcloud.generate_from_text(data) # frequency counts elif isinstance(data, Mapping): return wcloud.generate_from_frequencies(data) # tokenized text # NOTE: the second condition is there because of nltk.text, which # behaves like an Iterable / Collection / Sequence for all # practical intents and purposes, but the corresponding abstract # base classes don't pick up on it (maybe because it only has a # __getitem__ magic method?) elif isinstance(data, Iterable) or hasattr(data, "__getitem__"): return wcloud.generate_from_frequencies(Counter(data)) else: raise ValueError( "`data` must be a string, a mapping from words to frequencies, or an iterable of words." ) def _wordcloud_png(wcloud): from IPython.display import display return display(wcloud.to_image()) try: from IPython import get_ipython _ipython = get_ipython() # pylint: disable=invalid-name if _ipython is not None: _png_formatter = _ipython.display_formatter.formatters[ # pylint: disable=invalid-name "image/png" ] _png_formatter.for_type(WordCloud, _wordcloud_png) except ImportError: pass PK!H b>[&corpy-0.2.0.dist-info/entry_points.txtN+I/N.,()HM/*ԃU$[&fqUe薥d&' ʹPK!Hu)GTUcorpy-0.2.0.dist-info/WHEEL HM K-*ϳR03rOK-J,/R(O-)$qzd&Y)r$UV&UrPK!H. corpy-0.2.0.dist-info/METADATAVKr7HUP-njdGRʲ̲xi3`r5W1)٬ =_~D!$WL` ֵ[1|ǦCi{e*TQTWO,T g}sK1|8G'@c|P\Oq u8Ws[6Pz,ʮ(x}EQbN~80K{~4X߉XJ$8doߟq^ɂ/NhƹV:j (A:jƏvNhV0ЧCxEE;gЊ|YXCE1:^7}{7.ko NF[B-L<06̕кcM >:6B #XbaLY1 XMJOkĖ8`,]R|$R5%A27{h'ud8xӧAbl\[=X6KRbo5¾.)2s.1G kA VŰ2J:5*b@$Z9X^ cCFH=ֶRyL4U۳mf{SRn\סѤ8LˈNCNٛ}.&E'_H`DW X @Wʯ)Icezs~E^i^ &%4)hFBbiTLZ8HC?;CK96\_hH787v. yA8y-/oQ-ef_lU.څWag4\h 1(\V%")%ٙA:-&Uh-9׌+hU;FCE?xkpԜi?WΟaȕa/vA8b+P/d* _[x[Hn/blĶ6pIIg4^,RQH[r+ɷ"쳪tSҬ%ǒPK!H^corpy-0.2.0.dist-info/RECORDuɒJ}? Xâ( *d|FT]6'?㏓Q6 "_53}NgpLH03IT]c`i''#3#_|̏:(E6.%"&S͜F=&{@~M-fK aϿ(Ȳ$ [lh/g>?]E %5kGL`obTpzdJ 弾Xd|ap8,FԭY}*%pZ}f{ϵ KFdrdߒQigx3MP#ab(d6<$aVY&{Ɍz,pTʐ(=\0'}[kƲ+jEb!ViTz=';NT`Mc,k>I5 0w?;Mk6w*b0 n6.A0A4$KJJWz+؈{D>5.5Bpd\ȱ|bĿ_{V6Yư_PK!{{corpy/__init__.pyPK!Z?corpy/morphodita/__init__.pyPK!ڔw corpy/morphodita/tagger.pyPK!/| | ,#corpy/morphodita/tokenizer.pyPK!?|8|8/corpy/phonetics/cs.pyPK!nL''hcorpy/phonetics/exceptions.tsvPK!corpy/phonetics/phones.tsvPK!.zee!corpy/phonetics/substr2phones.tsvPK!LL!?corpy/phonetics/voicing_pairs.tsvPK!g4 ʖcorpy/scripts/xc.pyPK!ڒcorpy/scripts/zip_verticals.pyPK!:,corpy/util/__init__.pyPK!~j$$7corpy/vertical/__init__.pyPK!UЙcorpy/vis/__init__.pyPK!H b>[&_corpy-0.2.0.dist-info/entry_points.txtPK!Hu)GTUcorpy-0.2.0.dist-info/WHEELPK!H. ncorpy-0.2.0.dist-info/METADATAPK!H^corpy-0.2.0.dist-info/RECORDPK Y