Source code for articlequality.extractors.extractor

"""
.. autoclass:: articlequality.Extractor
    :members:

.. autoclass:: articlequality.TemplateExtractor
    :members:
    :inherited-members:
"""

import logging
import sys
import traceback
from collections import OrderedDict

import mwparserfromhell as mwp
import mwreverts

logger = logging.getLogger(__name__)


[docs]class Extractor: """ Implements an labeling event extraction strategy. :Parameters: name : `str` A name for the extraction strategy doc : `str` Documentation describing the extraction strategy namespace : `iterable`(`int`) A set of namespaces that will be considered when performing an extraction """ def __init__(self, name, doc, namespaces): self.__name__ = str(name) self.__doc__ = str(doc) self.namespaces = set(namespaces)
[docs] def extract(self, page, verbose=False): """ Processes an :class:`mwxml.Page` and returns a generator of first-observations of a project/label pair. :Parameters: page : :class:`mwxml.Page` Page to process verbose : `bool` print dots to stderr """ if page.namespace not in self.namespaces: pass else: if verbose: sys.stderr.write("\n{0}: ".format(page.title)) sys.stderr.flush() revisions = OrderedDict() detector = mwreverts.Detector() # Process all of the revisions looking for reverts for revision in page: revert = detector.process(revision.sha1, revision.id) try: revision_text = revision.text or "" project_labels = set(pl for pl in self.extract_labels(revision_text)) except Exception: logger.warning("Could not extract labels from text:") logger.warning(traceback.format_exc()) continue revisions[revision.id] = { 'id': revision.id, 'timestamp': revision.timestamp, 'was_reverted': False, 'is_a_revert': revert is not None, 'reverted': revert.reverteds if revert is not None else [], 'project_labels': project_labels } if revert is not None: # This revision is a revert. self.invert_reverted_status( revisions[revision.id]['reverted'], revisions) # Re-process revisions only considering those that were not # reverted last_labels = set() for rev_id, revision in revisions.items(): if revision['was_reverted']: if verbose: sys.stderr.write("r") sys.stderr.flush() continue # Get the new labels new_labels = revision['project_labels'] - last_labels last_labels = revision['project_labels'] # Log some verbose stuff if verbose: if len(new_labels) > 0: sys.stderr.write("l") else: sys.stderr.write(".") sys.stderr.flush() for project, label in new_labels: yield {'rev_id': revision['id'], 'timestamp': revision['timestamp'], 'project': project, 'wp10': label}
[docs] def invert_reverted_status(self, reverteds, revisions): """ This method recursively searches the reverted status of revisions and inverts the status when reverts are themselves reverted. """ for rev_id in reverteds: revisions[rev_id]['was_reverted'] = \ not revisions[rev_id]['was_reverted'] if revisions[rev_id]['is_a_revert']: self.invert_reverted_status(revisions[rev_id]['reverted'], revisions)
def extract_labels(self, text): raise NotImplementedError()
[docs]class TemplateExtractor(Extractor): """ Implements a template-based extraction strategy based on a `from_template` function that takes a template and returns a (project, label) pair. :Parameters: from_template : `func` A function that takes a template and returns a (project, label) pair """ def __init__(self, *args, from_template, **kwargs): self.from_template = from_template if 'filter_text' in kwargs: self.filter_text = kwargs.get('filter_text') kwargs.pop('filter_text', None) super().__init__(*args, **kwargs)
[docs] def extract_labels(self, text): """ Extracts a set of labels for a version of text by parsing templates. :Parameters: text : `str` Wikitext markup to extract labels from :Returns: An iterator over (project, label) pairs """ # filter_text is an initial fast pass to weed out wikitext that # can't contain the template (eg. because the template name # never appears) if hasattr(self, 'filter_text'): if not self.filter_text(text): return parsed_text = mwp.parse(text) templates = parsed_text.filter_templates() for template in templates: yield from self.from_template(template)