PKKFOv++contex/analyze_regex.pyfrom sre_constants import * import sre_parse def group_parents(regex): """ Return a dict of the form {: } where is a set of the group-indices of all the groups the is nested in. """ tree = sre_parse.parse(regex) info = {} _fill_group_info(tree, info, frozenset([0])) # All groups has 0 as a parent info[0] = frozenset() return info def _fill_group_info(subpattern, info, parents): """ Based on the algorithm of `sre_parse.SubPattern.dump`. This is not desirable, as the internal workings of `sre_parse` might be changed one day, though this seems to work fine in both python2.7 and 3.4. What else am I to do? Make my own parser for python regexes? That could easily introduce new bugs. Here's why I need to do this: https://mail.python.org/pipermail/python-list/2015-April/701611.html """ for op, av in subpattern: if op == IN: pass elif op == SUBPATTERN: group_number, content = av if group_number is None: # Non-capturing groups etc _fill_group_info(content, info, parents) else: info[group_number] = parents _fill_group_info(content, info, parents | frozenset([group_number])) elif op == BRANCH: for i, a in enumerate(av[1]): _fill_group_info(a, info, parents) elif op == GROUPREF_EXISTS: condgroup, item_yes, item_no = av _fill_group_info(item_yes, info, parents) if item_no: _fill_group_info(item_no, info, parents) elif isinstance(av, (tuple, list)): for a in av: if isinstance(a, sre_parse.SubPattern): _fill_group_info(a, info, parents) PKGRoWWcontex/__init__.py""" Copyright (C) 2015 Mattias Ugelvik This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . """ __all__ = ['StringContext', 'MatchContext', 'match', 'search', 'find', 'T', 'rules'] from . import analyze_regex import collections import itertools import fnmatch import re def T(string): """ Returns a `StringContext` version of `string`. The focus point is the whole string. """ return StringContext(focus=string) def search(string, pattern, flags=0): """ Short for `T(string).search(pattern, flags)` """ return T(string).search(pattern, flags) def match(string, pattern, flags=0): """ Short for `T(string).match(pattern, flags)` """ return T(string).match(pattern, flags) def find(string, substring, right_side=False): """ Short for `T(string).find(substring, right_side=right_side)` """ return T(string).find(substring, right_side=right_side) def positive_index(i, length): """ Convert a negative index to a positive one. """ return i if i >= 0 else length + i def ensure_str(val): """ Used to convert things that might be either `StringContext` or `str` into `str`. I could use just `str(val)`, that would work, but I want to keep open the possibility of `val` being something else, like a bytes object. """ return str(val) if isinstance(val, StringContext) else val class StringContext: """ A StringContext is like a string, but has three parts: the `before`, the `focus`, and the `after`. The `focus` is the part which you are currently 'working on'. StringContext's are immutable (or are treated as such, that's probably a more accurate statement), and any method that makes 'changes' returns a new copy. All methods operate on the whole string (i.e. not just the `focus`) unless otherwise noted. Indices refer to the whole string, not just `focus`. """ def __init__(self, before='', focus='', after=''): self.before, self.focus, self.after = self.tup = (before, focus, after) self._out = None def parts(self): """ Return a tuple of the form `(, , )`. """ return self.tup def replace(self, replacement): """ This will do one of two things: * If `callable(replacement)`, then return a new version of the string in which the new `focus` is `str(replacement(old_focus))`. * When it is a string it will replace `focus` with `str(replacement)`. """ return StringContext(focus=str(replacement(self.focus) if callable(replacement) else replacement), before=self.before, after=self.after) def focus_point(self, do_tuple=False): """ Return a slice object that identifies the current `focus`. Return it as a tuple of the form `(start, stop)` if `do_tuple`. """ start = len(self.before) stop = start + len(self.focus) return (start, stop) if do_tuple else slice(start, stop) def find(self, string, right_side=False): """ Find the first location of `string` and set that to be the `focus`. Returns None if it doesn't find the string. If `right_side` then it will search from right to left. """ result = (str.rfind if right_side else str.find)(str(self), string) # Using __getitem__ manually because MatchContext compliance return None if result == -1 else StringContext.__getitem__(self, slice(result, result+len(string))) def search(self, regex, flags=0): """ Search for the regular expression `regex` in the string with the regex flags `flags` and return a `MatchContext`, which is like a `StringContext` but with information pertaining to the match. It uses `re.search` for the matching. Returns `None` if the search is unsuccessful. """ return MatchContext.new(regex, str(self), re.search, flags) def match(self, regex, flags=0): """ Like `StringContext.search`, but uses `re.match` for the matching. """ return MatchContext.new(regex, str(self), re.match, flags) def __getitem__(self, key): """ This will return a new string where `key` designates the focus point. Slicing works like with normal strings, except that the 'step' isn't supported. """ string = str(self) if isinstance(key, int): key = positive_index(key, len(string)) return StringContext(focus=string[key], before=string[:key], after=string[key+1:]) elif isinstance(key, slice): assert key.step is None, "`step` isn't supported" return StringContext(focus=string[key], before=string[:0 if key.start is None else key.start], after=string[len(string) if key.stop is None else key.stop:]) return key def __iter__(self): """ Returns `iter(str(self))` """ return iter(str(self)) def __hash__(self): """ Returns `hash(str(self))` """ return hash(str(self)) def __len__(self): """ The total length of the string, including the `before` and `after`. """ return len(str(self)) def __repr__(self): return '{}({!r}, {!r}, {!r})'.format(self.__class__.__name__, *self.tup) def __eq__(self, other): """ Will compare equal to `str` and `StringContext` instances where the content of the strings matches. So the current `focus` doesn't matter. """ return str(self) == ensure_str(other) def __str__(self): """ Stitch together `before`, `focus` and `after` to make a normal string. """ if self._out: return self._out else: self._out = ''.join(self.tup) return self._out def get_index_groups(match): """ I need to do this because `match.lastindex` isn't reliable (the last, nested subgroup doesn't count for some reason). """ i = 0 while True: try: yield (i, match.span(i)) except IndexError: break i += 1 def get_groups(match): """ `match` is a match object from the `re` module. Returns a dict of the form {: (, )} where the 's are positional regex groups like 0, 1, 2, etc. or named regex groups. and are the indices, and together they form the `span` over the string, which may have to be transformed in the course of string manipulation. and are both called `points`, and the rules of transforming them are the same, which is helpful for code clarity. Regex groups that didn't match will not be present. """ groups = {name: match.span(name) for name in match.groupdict() if match.span(name) != (-1, -1)} groups.update({i: span for i, span in get_index_groups(match) if span != (-1, -1)}) # Removing (-1, -1), because those groups didn't match return groups def change_groups(groups, changefunc): """ Change every point with the function `changefunc`. `groups` is a mapping from group->span returned by `get_groups`. """ return {name: changefunc(name, start, stop) for name, (start, stop) in groups.items()} class MatchContext(StringContext): """ `MatchContext` is like `StringContext`, but also keeps track of matched regex groups and moves them around to the appropriate places when the string is manipulated. """ @staticmethod def new(regex, string, method, flags=0, group_parents=None): match = method(regex, string, flags) if match: group_span = get_groups(match) group_index = {g: g for g in group_span if isinstance(g, int)} group_index.update(match.re.groupindex) start, stop = group_span[0] group_parents = analyze_regex.group_parents(regex) if group_parents is None else group_parents return MatchContext.new_focus(string, group_parents, group_span, group_index, 0) @staticmethod def new_focus(string, group_parents, group_span, group_index, focus_group): span = group_span.get(focus_group) if span is None: raise ValueError("Regex group {!r} didn't match anything (or it doesn't exist)".format(focus_group)) else: start, stop = span return MatchContext(before=string[:start], focus=string[start:stop], after=string[stop:], group_parents=group_parents, group_span=group_span, group_index=group_index, focus_group=focus_group) def __init__(self, before, focus, after, group_parents, group_span, group_index, focus_group): StringContext.__init__(self, before=before, focus=focus, after=after) self.group_parents = group_parents self.group_span = group_span self.group_index = group_index self.focus_group = focus_group def parents(self, group_id): """ Return a set of group-indexes of the parent regex groups of `group_id`. """ return self.group_parents[self.group_index[group_id]] def __getitem__(self, key): """ `MatchContext` doesn't support slicing. This will return a `StringContext` object, and will thus lose the regex group information """ return StringContext.__getitem__(self, key) def replace(self, replacement): """ This will do one of two things: * If `callable(replacement)`, then return a new version of the string in which the new `focus` is `str(replacement(old_focus))`. * Else it will replace `focus` with `str(replacement)`. This will move around the points of the matched regex groups to keep them relevant. Lets say that the ()'s signify the various groups in this string: '(He)ll(o)(!)'. If your `focus` is the second group (which it would be if you did `str.group(2)`) and you did `str.replace('')`, then the result would be '(He)ll()(!)'. The logic for the points in the third group is: 'looks like the replaced focus lost 1 char, and I am ahead in the string, so I better move back one char too'. If your `focus` is the outer-most regex group here: 'Hay, ((ho ho), yolo) yoyo', and you replace the focus with 'ho', then the result will be this: 'Hay, ((ho)) yoyo'. The nested group shrinks. But if your `focus` is the outer-most regex group here: 'Hay, ((ho)) yoyo', and you replace the focus with 'YOYO', then the result will be this: 'Hay, ((YO)YO) yoyo'. The nested group doesn't see any reason to expand. """ replacement = str(replacement(self.focus) if callable(replacement) else replacement) length = len(self.focus) change = len(replacement) - length start = len(self.before) newstop = start+length+change focus_idx = self.group_index[self.focus_group] def change_span(group, span_start, span_stop): gindex = self.group_index[group] if gindex == focus_idx: # Same group return (span_start, span_stop+change) elif gindex in self.group_parents[focus_idx]: # group is a parent of focus_group return (span_start, span_stop+change) elif focus_idx in self.group_parents[gindex]: # group is a child of focus_group return (min(newstop, span_start), min(newstop, span_stop)) else: # The groups aren't nested in any way if gindex > focus_idx: return (span_start+change, span_stop+change) else: return (span_start, span_stop) new_group_span = change_groups(self.group_span, change_span) return MatchContext(group_span=new_group_span, before=self.before, focus=replacement, after=self.after, group_parents=self.group_parents, group_index=self.group_index, focus_group=self.focus_group) def group(self, group_id): """ Sets the focus point to the part which the regex group `group_id` matched. `group_id` can be an integer signifying a positional regex group (0 is the whole string, 1 is first group, 2 second, and so on) or a string signifying a named regex group. """ return MatchContext.new_focus(str(self), self.group_parents, self.group_span, self.group_index, group_id) def apply_rules(self, rules): """ Apply the `rules` and return the resulting MatchContext (with focus changed). The form of the rules are described in the docstring for `contex.rules`. """ result = self for area, replacement in sorted(rules.items(), key=lambda rule: len(self.parents(rule[0])), # ^ Replace the areas with the most amount of parents first. This way, # the innermost regex groups are always replaced first. reverse=True): result = result.group(area).replace(replacement) return result def expand(self, template, **kwargs): """ Like `str.format` but the indices in the template are the positional regex groups and the names are either named regex groups or keyword arguments to `expand` (the latter take precedence). For example: >>> name = contex.match('Bob Bobbison!', r'(?P\w+) (?P\w+)(!)?') >>> name.expand('{3:+<4} {surname} {hey} {forename} {1}', hey='YoYo', forename='Argh') '!+++ Bobbison YoYo Argh Bob' The `{3}` refer to the third regex group which is the exclam; `forename` did refer to the regex group but was overwritten by the keyword argument to `expand`; The `forename` group was available through `{1}` however, because that group was also regex group #1. """ out = str(self) # I distribute the groups into the named groups and the index groups to use them # with `str.format`. named_groups = { group: out[slice(*span)] for group, span in self.group_span.items() if isinstance(group, str) } named_groups.update(kwargs) index_groups = tuple(string for (index, string) in sorted({ group: out[slice(*span)] for group, span in self.group_span.items() if isinstance(group, int) }.items())) return template.format(*index_groups, **named_groups) def __repr__(self): return '<{} object; group={!r}, tup={!r}>'.format(self.__class__.__name__, self.focus_group, self.tup) class Transformer: def __init__(self, regex, rules, method, flags=0): self.regex = re.compile(regex, flags=flags) self.group_parents = analyze_regex.group_parents(regex) self.rules = rules self.method = method def _apply(self, string): view = MatchContext.new(self.regex, string, self.method, group_parents=self.group_parents) return None if view is None else view.apply_rules(self.rules) def apply(self, string): """ Performs the replacements according to the rules and returns the resulting string. Returns `None` if the `string` doesn't match. """ result = self._apply(string) return None if result is None else str(result) __call__ = apply def expand(self, string, template, **kwargs): """ Like `str.format` but the indices in the template are the positional regex groups and the names are either named regex groups or keyword arguments to `expand` (the latter take precedence). For example: >>> t = contex.rules(r'(?P\w+) (?P\w+)', {}) >>> t.expand('Bob Bobbington', '{greeting}, Mr. {last_name}!', greeting='Welcome') 'Welcome, Mr. Bobbington!' It will also apply the rules (in the example above there are none) before expanding. Returns `None` if the `string` doesn't match. More info can be found in `MatchContext.expand`. """ result = self._apply(string) return None if result is None else result.expand(template, **kwargs) def __repr__(self): return '<{} object; regex={!r}, rules={!r}, method={}.{}>'.format( self.__class__.__name__, self.regex, self.rules, self.method.__module__, self.method.__name__ ) def rules(regex, rule_dict, method=None, flags=0): """ Usage: contex.rules(regex, rule_dict, method=re.fullmatch, flags=0) Returns a `Transformer` object ready to transform strings according to the rules you have set in `rule_dict`. The `regex` will be matched against strings using `re.fullmatch` (or an equivalent behaviour will be arranged for earlier versions of python), and you can refer to matched regex groups in the `rule_dict`. If you want to use a different function to search with, such as `re.search`, then you can pass that in as the `method` argument. Although the `regex` might match only a substring if you use `re.search`, you will still manipulate the string as a whole. `regex` must be a `str`, and it will be compiled with `re.compile`, along with the `flags` you pass in. The rule dict is of the form {: }. can be: * int, designating a regular expression group. * str, designating a named regular expression group. The can be: * str, designating a verbatim string replacement. * unary function. It will be passed the text of to modify, and should return the new replacement which will automatically be converted to a str. If one is inside another in the same rule_dict, then the innermost will be replaced first. This replacement will be visible to the replacement for the outer if it happens to be a function. You can then use the `.apply` or `.expand` methods of the `Transformer` object to manipulate the string. * Transformer.apply(string) will merely do the replacements and return the resulting string. You can also call the transformer object as a function which does the same thing. * Transformer.expand(string, template, **kwargs) also does the replacements, but will fill in the `template` string with info from the match and from keyword arguments. The format for the template string is the same as with `str.format`, but the numbers refer to positional regex group and the strings to named regex groups (unless overwritten by the **kwargs). Here is an example: >>> contex.rules(r'(Photo)(\d+)_(?P\d{4})\.(?Pjpg)', { 1: 'Foto', # I like to spell it this way better 2: lambda num: int(num) + 1, # Turns out the numbers should be 1 higher 'year': lambda year: year[-2:], # I don't need all 4 digits 'suffix': 'jpeg' }).apply('Photo25_2009.jpg') 'Foto26_09.jpeg' Pretty readable huh, considering how complicated and numerous those replacements are? Now lets say that you want to make the changes like above, but that you also want to change the layout of the filename to `--Foto.`. This is how: >>> contex.rules(r'Photo(\d+)_(?P\d{4})\.jpg', { 1: lambda num: int(num) + 1, # Turns out the numbers should be 1 higher 'year': lambda year: year[-2:] # I don't need all 4 digits }).expand('Photo25_2009.jpg', '{year}-{1}-Foto.jpeg') '09-26-Foto.jpeg' This also allowed me to remove a couple of rules. Here is an example of using regex flags and specifying a method: >>> contex.rules(r"FEEL", {0: "YOLO"}, method=re.search, flags=re.I).apply("I feel great") 'I YOLO great' The 0th regex group designates the whole match, as is tradition, but the whole match in this case is only a substring of the string you're manipulating. If you want only the match area returned, you can use `.expand(string, "{0}")` to extract it. """ if method is None: if not hasattr(re, 'fullmatch'): # Needed for earlier versions of python that doesn't have `re.fullmatch` # \Z matches end-of-string, together with `re.match` it should work just # like `re.fullmatch`. \Z\Z still matches the end-of-string. regex = regex+'\Z' method = re.match else: method = re.fullmatch return Transformer(regex, rule_dict, method, flags) PKݓGD"D"$contex-3.1.dist-info/DESCRIPTION.rstContex - Contextual string manipulation ======================================= Abstract --------- This package provides ``contex.rules``, an interface which enables a very declarative form of string manipulation, where you can manipulate a string "in one go" in sophisticated ways. This library also provides two related abstractions, ``StringContext`` and ``MatchContext``, which can be used for a more stateful manipulation of strings. I recommend using ``contex.rules`` as I think that makes for more readable code. Nevertheless, those abstractions are well documented and might usefully serve as building blocks. Indeed, ``contex.rules`` is implemented on top of them. The problem with our interfaces for string manipulation ------------------------------------------------------- My motivation for creating this package was that I was assigned a task in which it was necessary to change strings such as ``'1_Photo032-2008.jpg'`` into ``'1_Photo031-2008.jpg'``. All the numbers could vary between filenames, and it seemed like I always had to do something inelegant to accomplish this task. Maybe it was to match the various parts and stich them back together: .. code-block:: python >>> match = re.fullmatch('(\d+)_Photo(\d+)-(\d+)\.jpg', '1_Photo032-2008.jpg') >>> '{}_Photo{}-{}.jpg'.format(match.group(1), '{:0>3}'.format(int(match.group(2))-1), match.group(3)) '1_Photo031-2008.jpg' Or using ``re.sub`` with non-consuming regex groups to match the correct area of the string: .. code-block:: python >>> re.sub('(\d+)(?=-\d+\.jpg)', lambda m: '{:0>3}'.format(int(m.group(1))-1), '1_Photo032-2008.jpg') '1_Photo031-2008.jpg' Shouldn't this be simpler? Describing that string with a regular expression is simple enough, and I'm only changing one little part of the string, so why do I have to fiddle around with indices, and why do I have to sacrifice readability? Most importantly, why do I have to experience this aesthetic pain deep in my heart? First attempt: stateful manipulation ------------------------------------ My first idea was that our abstractions aren't fit for this sort of problem. Strings are flat, they have no sense of context, and if you pull out a substring then it requires special effort to stich it back together. The solution? Just keep track of the ``before`` and the ``after``: .. code-block:: python >>> view = contex.match('1_Photo032-2008.jpg', '\d+_Photo(?P\d+)-\d+\.jpg') >>> view >>> view.group('number') >>> result = view.group('number').replace(lambda n: '{:0>3}'.format(int(n)-1)) >>> result >>> str(result) '1_Photo031-2008.jpg' >>> This way I can move around the "focus point" of the string with methods such as ``.group``, manipulate that space, and when I'm done convert it back to a ``str``. I can even manipulate more than one area of the string: .. code-block:: python >>> view = contex.match('1_Photo032-2008.jpg', '\d+_Photo(?P\d+)-(?P\d+)\.jpg') >>> view.group('number').replace('').group('year').replace(lambda y: y[-2:]) >>> ``MatchContext`` keeps track of where the matched regular expression groups are: Even though I removed the content of the "number" group, ``MatchContext`` knows where to find and replace the "year" group. It can also deal with nested regex groups, 0-length matches etc. .. note:: Previously (v2.0.1 and earlier) I allowed arbitrary slicing on ``MatchContext`` objects to select the focus point in addition to the ``.group`` method. This was a mistake. When you're dealing with 0-length slices and adjacent regex groups that matched 0-length strings, there arises serious problems of semantics. I found out that the expected semantics is inextricably linked to which regex group you previously selected with ``.group``, and therefore had to disallow slicing for ``MatchContext`` objects. Removing the state: Vive la Revolution -------------------------------------- The ``MatchContext`` abstraction certainly is an improvement for these particular types of problems, but there is one downside to it, and that is that it adds an additional layer of state to ordinary strings: The programmer must remember which part of the string is in "focus", or, in other words, which state the string is in. So my next challenge was to eliminate the state. What I found out was that only in rare cases is the state needed or useful, and this lead me to believe that the fundamental problem isn't really the abstractions we use for representing strings, but rather the interfaces we have for manipulating them. Thus, pardon the pun, enter ``contex.rules``: .. code-block:: python >>> contex.rules('\d+_Photo(?P\d+)-(?P\d+)\.jpg', { ... 'number': lambda n: '{:0>3}'.format(int(n) - 1), ... 'year': lambda y: y[-2:] ... }).apply('1_Photo032-2008.jpg') '1_Photo031-08.jpg' Or maybe I want to change the layout of the filename completely: .. code-block:: python >>> contex.rules('(\d+)_Photo(?P\d+)-(?P\d+)\.jpg', { ... 'number': lambda n: int(n) - 1, ... 'year': lambda y: y[-2:] ... }).expand('1_Photo032-2008.jpg', 'Photo_{1}_{number:0>3}-{year}.jpeg') 'Photo_1_031-08.jpeg' The string manipulation is done in one go. The programmer doesn't need to remember where the focus point is right now, or specify which order to do the replacements in. This is a much more *declarative* interface: you tell it what the string looks like, what changes you want made, and it figures out the rest. You don't need to stich the pieces back together, and can create more readable regular expressions as well because of that. Nested regex groups are also allowed: the nested one will be replaced first (which will make a difference if the replacement for the outer group is a callable). More advanced example ^^^^^^^^^^^^^^^^^^^^^ Here's an example using ``re.search`` (as opposed to ``re.fullmatch``, which is the default): .. code-block:: python >>> contex.rules('(?P\d)\d{3}', { ... 'millennium': lambda s: int(s)+1, ... 0: lambda y: '{}'.format(y) ... }, method=re.search).apply('Current year: 2015') 'Current year: 3015' Notice that the ``'millennium'`` group is replaced before the ``0`` group. ``contex.rules`` is explained in more detail in its very long docstring. Doubtful stability ------------------ In order to retrieve certain information about the regular expressions to resolve ambiguities related to 0-length matches and so on, I've seen it necessary to use ``sre_parse.parse`` to parse the regular expressions. This is an "internal support module" or something like that, and the stability of this library becomes doubtful as a result. My judgement was that it would take a lot of time and effort to create my own parser for python regular expressions, and I could easily create some bugs in that parser too. Conclusion ---------- I hope that the examples of ``contex.rules`` I have given are sufficiently intuitive so that any programmer can look at them and infer pretty accurately what they do, because the whole point of this endeavor is to increase readability. Furthermore, I'd be interested to see if other people can take this idea ``^\w{7}`` Using Contex ------------ The ``contex`` package contains 5 functions: - ``rules(regex, rule_dict, method=re.fullmatch, flags=0)`` for declarative string manipulation. - ``T(string)`` for converting a string into a ``StringContext`` object. - ``search(string, pattern, flags=0)`` and - ``match(string, pattern, flags=0)`` for regex searches (with the same semantic difference as in the ``re`` module). They both return a ``MatchContext`` object. - ``find(string, substring, right_side=False)`` for finding a substring, returns a ``StringContext`` object. ``contex`` also contains the ``StringContext`` and ``MatchContext`` classes. Installing ---------- ``contex`` should work in both Python 2.7 and 3. Install with ``$ pip install contex``. If you want to install for Python 3 you might want to replace ``pip`` with ``pip3``, depending on how your system is configured. Developing ---------- Contex is documented and tested. Run ``$ nosetests`` or ``$ python3 setup.py test`` to run the tests. The code is hosted at https://notabug.org/Uglemat/Contex License ------- The library is licensed under the GNU General Public License 3 or later. This README file is public domain. PKݓGP"contex-3.1.dist-info/metadata.json{"classifiers": ["Topic :: Text Processing :: General", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", "Programming Language :: Python :: 3", "Programming Language :: Python :: 2.7", "Intended Audience :: Developers", "Topic :: Software Development"], "extensions": {"python.details": {"contacts": [{"email": "uglemat@gmail.com", "name": "Mattias Ugelvik", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://notabug.org/Uglemat/Contex"}}}, "generator": "bdist_wheel (0.24.0)", "license": "GPL3+", "metadata_version": "2.0", "name": "contex", "summary": "Contextual string manipulation", "test_requires": [{"requires": ["nose"]}], "version": "3.1"}PKݓGEU"contex-3.1.dist-info/top_level.txtcontex PKݓG[C\\contex-3.1.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.24.0) Root-Is-Purelib: true Tag: py3-none-any PKݓG2';zc$c$contex-3.1.dist-info/METADATAMetadata-Version: 2.0 Name: contex Version: 3.1 Summary: Contextual string manipulation Home-page: https://notabug.org/Uglemat/Contex Author: Mattias Ugelvik Author-email: uglemat@gmail.com License: GPL3+ Platform: UNKNOWN Classifier: Topic :: Text Processing :: General Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 2.7 Classifier: Intended Audience :: Developers Classifier: Topic :: Software Development Contex - Contextual string manipulation ======================================= Abstract --------- This package provides ``contex.rules``, an interface which enables a very declarative form of string manipulation, where you can manipulate a string "in one go" in sophisticated ways. This library also provides two related abstractions, ``StringContext`` and ``MatchContext``, which can be used for a more stateful manipulation of strings. I recommend using ``contex.rules`` as I think that makes for more readable code. Nevertheless, those abstractions are well documented and might usefully serve as building blocks. Indeed, ``contex.rules`` is implemented on top of them. The problem with our interfaces for string manipulation ------------------------------------------------------- My motivation for creating this package was that I was assigned a task in which it was necessary to change strings such as ``'1_Photo032-2008.jpg'`` into ``'1_Photo031-2008.jpg'``. All the numbers could vary between filenames, and it seemed like I always had to do something inelegant to accomplish this task. Maybe it was to match the various parts and stich them back together: .. code-block:: python >>> match = re.fullmatch('(\d+)_Photo(\d+)-(\d+)\.jpg', '1_Photo032-2008.jpg') >>> '{}_Photo{}-{}.jpg'.format(match.group(1), '{:0>3}'.format(int(match.group(2))-1), match.group(3)) '1_Photo031-2008.jpg' Or using ``re.sub`` with non-consuming regex groups to match the correct area of the string: .. code-block:: python >>> re.sub('(\d+)(?=-\d+\.jpg)', lambda m: '{:0>3}'.format(int(m.group(1))-1), '1_Photo032-2008.jpg') '1_Photo031-2008.jpg' Shouldn't this be simpler? Describing that string with a regular expression is simple enough, and I'm only changing one little part of the string, so why do I have to fiddle around with indices, and why do I have to sacrifice readability? Most importantly, why do I have to experience this aesthetic pain deep in my heart? First attempt: stateful manipulation ------------------------------------ My first idea was that our abstractions aren't fit for this sort of problem. Strings are flat, they have no sense of context, and if you pull out a substring then it requires special effort to stich it back together. The solution? Just keep track of the ``before`` and the ``after``: .. code-block:: python >>> view = contex.match('1_Photo032-2008.jpg', '\d+_Photo(?P\d+)-\d+\.jpg') >>> view >>> view.group('number') >>> result = view.group('number').replace(lambda n: '{:0>3}'.format(int(n)-1)) >>> result >>> str(result) '1_Photo031-2008.jpg' >>> This way I can move around the "focus point" of the string with methods such as ``.group``, manipulate that space, and when I'm done convert it back to a ``str``. I can even manipulate more than one area of the string: .. code-block:: python >>> view = contex.match('1_Photo032-2008.jpg', '\d+_Photo(?P\d+)-(?P\d+)\.jpg') >>> view.group('number').replace('').group('year').replace(lambda y: y[-2:]) >>> ``MatchContext`` keeps track of where the matched regular expression groups are: Even though I removed the content of the "number" group, ``MatchContext`` knows where to find and replace the "year" group. It can also deal with nested regex groups, 0-length matches etc. .. note:: Previously (v2.0.1 and earlier) I allowed arbitrary slicing on ``MatchContext`` objects to select the focus point in addition to the ``.group`` method. This was a mistake. When you're dealing with 0-length slices and adjacent regex groups that matched 0-length strings, there arises serious problems of semantics. I found out that the expected semantics is inextricably linked to which regex group you previously selected with ``.group``, and therefore had to disallow slicing for ``MatchContext`` objects. Removing the state: Vive la Revolution -------------------------------------- The ``MatchContext`` abstraction certainly is an improvement for these particular types of problems, but there is one downside to it, and that is that it adds an additional layer of state to ordinary strings: The programmer must remember which part of the string is in "focus", or, in other words, which state the string is in. So my next challenge was to eliminate the state. What I found out was that only in rare cases is the state needed or useful, and this lead me to believe that the fundamental problem isn't really the abstractions we use for representing strings, but rather the interfaces we have for manipulating them. Thus, pardon the pun, enter ``contex.rules``: .. code-block:: python >>> contex.rules('\d+_Photo(?P\d+)-(?P\d+)\.jpg', { ... 'number': lambda n: '{:0>3}'.format(int(n) - 1), ... 'year': lambda y: y[-2:] ... }).apply('1_Photo032-2008.jpg') '1_Photo031-08.jpg' Or maybe I want to change the layout of the filename completely: .. code-block:: python >>> contex.rules('(\d+)_Photo(?P\d+)-(?P\d+)\.jpg', { ... 'number': lambda n: int(n) - 1, ... 'year': lambda y: y[-2:] ... }).expand('1_Photo032-2008.jpg', 'Photo_{1}_{number:0>3}-{year}.jpeg') 'Photo_1_031-08.jpeg' The string manipulation is done in one go. The programmer doesn't need to remember where the focus point is right now, or specify which order to do the replacements in. This is a much more *declarative* interface: you tell it what the string looks like, what changes you want made, and it figures out the rest. You don't need to stich the pieces back together, and can create more readable regular expressions as well because of that. Nested regex groups are also allowed: the nested one will be replaced first (which will make a difference if the replacement for the outer group is a callable). More advanced example ^^^^^^^^^^^^^^^^^^^^^ Here's an example using ``re.search`` (as opposed to ``re.fullmatch``, which is the default): .. code-block:: python >>> contex.rules('(?P\d)\d{3}', { ... 'millennium': lambda s: int(s)+1, ... 0: lambda y: '{}'.format(y) ... }, method=re.search).apply('Current year: 2015') 'Current year: 3015' Notice that the ``'millennium'`` group is replaced before the ``0`` group. ``contex.rules`` is explained in more detail in its very long docstring. Doubtful stability ------------------ In order to retrieve certain information about the regular expressions to resolve ambiguities related to 0-length matches and so on, I've seen it necessary to use ``sre_parse.parse`` to parse the regular expressions. This is an "internal support module" or something like that, and the stability of this library becomes doubtful as a result. My judgement was that it would take a lot of time and effort to create my own parser for python regular expressions, and I could easily create some bugs in that parser too. Conclusion ---------- I hope that the examples of ``contex.rules`` I have given are sufficiently intuitive so that any programmer can look at them and infer pretty accurately what they do, because the whole point of this endeavor is to increase readability. Furthermore, I'd be interested to see if other people can take this idea ``^\w{7}`` Using Contex ------------ The ``contex`` package contains 5 functions: - ``rules(regex, rule_dict, method=re.fullmatch, flags=0)`` for declarative string manipulation. - ``T(string)`` for converting a string into a ``StringContext`` object. - ``search(string, pattern, flags=0)`` and - ``match(string, pattern, flags=0)`` for regex searches (with the same semantic difference as in the ``re`` module). They both return a ``MatchContext`` object. - ``find(string, substring, right_side=False)`` for finding a substring, returns a ``StringContext`` object. ``contex`` also contains the ``StringContext`` and ``MatchContext`` classes. Installing ---------- ``contex`` should work in both Python 2.7 and 3. Install with ``$ pip install contex``. If you want to install for Python 3 you might want to replace ``pip`` with ``pip3``, depending on how your system is configured. Developing ---------- Contex is documented and tested. Run ``$ nosetests`` or ``$ python3 setup.py test`` to run the tests. The code is hosted at https://notabug.org/Uglemat/Contex License ------- The library is licensed under the GNU General Public License 3 or later. This README file is public domain. PKݓG(mcxxcontex-3.1.dist-info/RECORDcontex/analyze_regex.py,sha256=RsjDqS4KfUhrZQe1v9RNe9BoHRHTeOeh9dqGX9FoIuo,1835 contex/__init__.py,sha256=_-LnDOXLocpY59RATywnI-PloLvhsoQ8MXvTD2uWh48,22429 contex-3.1.dist-info/RECORD,, contex-3.1.dist-info/METADATA,sha256=TsUx2UWm837W7mOa_I5FiSpY_DOptRUTJ4cnc2gncOI,9315 contex-3.1.dist-info/WHEEL,sha256=-aSo8rHuuPDEFzkcqqQ55pDyCjy25bYMLxSiHWKAOTc,92 contex-3.1.dist-info/metadata.json,sha256=jTWJpSAOrn6SatP_ZziX8Qyn5aKMf4m4XatNM6rnQc4,735 contex-3.1.dist-info/top_level.txt,sha256=Jk2ojGh-mGli0p8efzIi5IM8OEc4Et-0TAiBfNtGaTo,7 contex-3.1.dist-info/DESCRIPTION.rst,sha256=hTU7UmLhk0bp2Yy8PlAPbCHbNZrcaaCTX4azDkvSeq4,8772 PKKFOv++contex/analyze_regex.pyPKGRoWW`contex/__init__.pyPKݓGD"D"$-_contex-3.1.dist-info/DESCRIPTION.rstPKݓGP"contex-3.1.dist-info/metadata.jsonPKݓGEU"҄contex-3.1.dist-info/top_level.txtPKݓG[C\\contex-3.1.dist-info/WHEELPKݓG2';zc$c$contex-3.1.dist-info/METADATAPKݓG(mcxxKcontex-3.1.dist-info/RECORDPKS