# User:Cryptic/dup-publisher.py

#!/usr/bin/env python3
#
#<nowiki>
"""Outputs a list of articles from a given dump file containing
"citation" or "cite [anything]" templates with duplicate values in
publisher and either encyclopedia, journal, magazine, newspaper,
series, title, website, or work parameters, or in journal and series
parameters.

For [[WP:RAQ#Find all instances of journal=publisher]] circa 28 June
2023.
"""

import sys

# Refuse to run on interpreters that are too old.  This is an explicit
# check rather than an ``assert`` so that it is not stripped when Python
# is run with -O, and the message is a plain string because it has no
# placeholders to format.
if sys.version_info < (3, 6):
    raise SystemExit("requires Python 3.6 or newer")

import argparse
import bz2
import os
import re
import shutil
import textwrap
import xml.sax

import mwparserfromhell


#############
# Arguments #
#############

# Each option below is a module-level global that _main() assigns from
# the parsed command line; the HELP_* constant next to it is the help
# text given to argparse for the corresponding option.

# True to examine pages in every namespace; False to examine only
# namespace 0.
_all_namespaces = False
HELP_ALL_NAMESPACES = "parse pages in all namespaces, not just article"

# None when progress counting is disabled; otherwise the number of pages
# read so far (incremented by process_page()).
_count = None
HELP_COUNT = ("""output a running count of matched pages to stderr,
              updating every thousand pages read""")

# File object that matching page names are printed to.
_output = None
HELP_OUTPUT = "output file, a list of page titles; defaults to stdout"

# True to follow each page name with a tab and a description of the
# first matching template/parameter pair.
_print_matches = False
HELP_PRINT_MATCHES = ("""output the page name, a tab, and the names of
                      the first set of matching template parameters
                      instead of just the page name""")


#################
# Other globals #
#################

# _rx_rough_match is used to eliminate pages from consideration before
# the expensive full parse; it's important that it have no false
# negatives.  re.DOTALL is needed for that: a template's parameters may
# be written on different lines than its name, and without the flag
# ".*" stops at the first newline, so such templates would be filtered
# out here and never reach the full parse.
_rx_rough_match = re.compile(
    r"{{\s*[cC]it(?:ation\b|e ).*(publisher|series)", re.DOTALL)
# target template name
_rx_template_name = re.compile(r"^[cC]it(?:ation$|e )")

# maps namespace numbers to names; _XMLHandler stores each name with a
# trailing ":" already appended
_namespaces = {}
_matched_pages = 0  # count of pages w/at least one duplicate param pair


class _XMLHandler(xml.sax.ContentHandler):
    """SAX content handler that feeds dump pages to process_page().

    The handler collects the character data of <ns>, <title> and <text>
    elements; when a <page> element ends and all three were seen, it
    calls process_page(int(ns), title, text).  It also records every
    <namespace key="N">name</namespace> element in the module-level
    _namespaces dict as N -> name + ":".

    Attributes:
        ns, title, text: character data of the most recent element of
            that name, or None if none was seen since the last <page>
            start tag.  The value is "" while the element is still open
            and is filled in when its end tag is reached.
        namespace, namespace_key: name and integer key of the most
            recent <namespace> element; namespace_key is None if the
            element had no "key" attribute.
        tags: stack of the currently open elements this handler tracks
            (other elements are not pushed), with None as a sentinel at
            the bottom so tags[-1] is always valid.
    """

    # Elements whose character data is collected into the attribute of
    # the same name.
    _TEXT_ELEMENTS = ("ns", "title", "text", "namespace")

    def __init__(self):
        super().__init__()
        self.ns = None
        self.title = None
        self.text = None
        self.tags = [None]
        self.namespace = None
        self.namespace_key = None
        # Per-element lists of character chunks.  Chunks are joined once
        # at the end tag instead of being concatenated onto the
        # attribute one at a time, which would copy the accumulated
        # string for every chunk received.
        self._chunks = {}

    def startElement(self, name, attrs):
        """Start tracking a new element if it is one we care about."""
        if name == "page":
            self.ns = None
            self.title = None
            self.text = None
            # These shouldn't be present in <page> tags anyway, but.
            self.namespace = None
            self.namespace_key = None
        elif name in self._TEXT_ELEMENTS:
            setattr(self, name, "")
            self._chunks[name] = []
            if name == "namespace":
                # A missing key leaves namespace_key as None, which
                # endElement() skips, rather than failing in int(None).
                key = attrs.get("key")
                self.namespace_key = None if key is None else int(key)
        else:
            return

        self.tags.append(name)

    def endElement(self, name):
        """Finish a tracked element and act on completed pages/namespaces."""
        if name == self.tags[-1]:
            self.tags.pop()
            chunks = self._chunks.get(name)
            if chunks is not None:
                setattr(self, name, "".join(chunks))

        if ((name == "page" and self.text is not None
             and self.ns is not None and self.title is not None)):
            process_page(int(self.ns), self.title, self.text)
        elif name == "namespace" and self.namespace_key is not None:
            _namespaces[self.namespace_key] = self.namespace + ":"

    def characters(self, content):
        """Collect character data for the innermost tracked element."""
        # tags[-1] is None or "page" when no text-bearing element is
        # open; neither has a chunk list, so such data is ignored.
        chunks = self._chunks.get(self.tags[-1])
        if chunks is not None:
            chunks.append(content)


def pagename(ns, title):
    """Return human-readable name of page title in numbered namespace ns.

    ns is the integer namespace number and title the page title without
    any namespace prefix.  Namespace 0 yields the bare title.  A
    namespace listed in _namespaces yields "Name:title".  Any other
    namespace yields "{{ns:N}}:title".
    """
    if ns == 0:         # Special-case to omit the :
        return title
    if ns in _namespaces:
        prefix = _namespaces[ns]
        # _XMLHandler stores names with the ":" separator already
        # appended; only add one if it is missing, so the result never
        # contains a doubled "::".
        if not prefix.endswith(":"):
            prefix += ":"
        return prefix + title
    return "{{ns:" + str(ns) + "}}:" + title


def process_page(ns, title, text):
    """Examine one page and report it if it has duplicate cite params.

    The page (namespace number ns, title title, wikitext text) is
    handed to has_dupe_cite_params() only if its namespace is wanted
    (namespace 0, or any namespace when _all_namespaces is set) and
    _rx_rough_match finds a candidate in text.  A hit increments
    _matched_pages and prints the page name to _output, followed by a
    tab and the match description when _print_matches is set.

    Independently of the outcome, when counting is enabled (_count is
    not None) the page counter is incremented and a progress line is
    written to stderr after every 1000th page.
    """
    global _count, _matched_pages

    wanted_namespace = _all_namespaces or ns == 0
    description = None
    if wanted_namespace and _rx_rough_match.search(text):
        description = has_dupe_cite_params(text)

    if description is not None:
        _matched_pages += 1
        line = pagename(ns, title)
        if _print_matches:
            line = line + "\t" + description
        print(line, file=_output)

    # Progress reporting is optional; nothing more to do when it's off.
    if _count is None:
        return

    _count += 1
    if _count % 1000 == 0:
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)


def has_dupe_cite_params(text):
    """Find the first citation template in text with duplicated values.

    text is parsed with mwparserfromhell and every template whose name
    matches _rx_template_name is checked, in the order
    filter_templates() yields them, for

    * a non-empty publisher parameter equal to the encyclopedia,
      journal, magazine, newspaper, series, title, website, or work
      parameter, or
    * a non-empty journal parameter equal to the series parameter.

    Parameter values are compared as wikicode strings with surrounding
    whitespace removed; the comparison is exact.

    Returns a string suitable for the print-matches option, of the form
    {{template name}}:param1,param2="value", for the first duplicate
    found; else None.
    """

    # Parameters that are compared against publisher, in checking order.
    publisher_peers = ("encyclopedia",
                       "journal",
                       "magazine",
                       "newspaper",
                       "series",
                       "title",
                       "website",
                       "work")

    def errval(template, param1name, param2name, paramval):
        """Return a string suitable for the print-matches option"""
        return ("{{" + str(template.name).strip() + "}}:" + param1name + ","
                + param2name + '="' + paramval + '"')

    def param(template, param_name):
        """Return the wikicode of template's parameter param_name as a
        str, or None if empty or not present
        """
        par = template.get(param_name, default=None)
        if par is None:
            return None
        rval = str(par.value).strip()
        if rval == "":
            return None
        return rval

    def is_target(template):
        """Return True if template's name matches _rx_template_name."""
        raw_name = str(template.name)
        # The parsed name keeps any whitespace or newline written around
        # it ("{{ cite web", "{{citation |..."), which defeats the
        # pattern's "^" and "$" anchors, so the stripped name is tested.
        # The lstrip()-only form is tested as well so that every name
        # accepted before stripping was introduced (such as "cite ")
        # is still accepted.
        return bool(_rx_template_name.match(raw_name.strip())
                    or _rx_template_name.match(raw_name.lstrip()))

    for t in mwparserfromhell.parse(text).filter_templates():
        if not is_target(t):
            continue

        publisher = param(t, "publisher")
        if publisher is not None:
            for other in publisher_peers:
                if publisher == param(t, other):
                    return errval(t, "publisher", other, publisher)

        journal = param(t, "journal")
        if journal is not None and journal == param(t, "series"):
            return errval(t, "journal", "series", journal)

    return None


def _fill_paragraphs(text, width=None):
    """Return text re-wrapped one paragraph at a time.

    Paragraphs are the pieces of text separated by two consecutive
    newlines.  Each is wrapped with textwrap.fill() and the results are
    joined with two newlines again, so paragraph breaks survive.

    width is the maximum line length.  When it is None, a width is
    derived from the terminal size as described below.
    """
    if width is None:
        # The default mirrors what argparse.HelpFormatter.__init__()
        # does, so the description lines up with argparse's own option
        # formatting as closely as practical.  argparse changed how it
        # finds the width in Python 3.8, hence the version switch.
        # argparse offers no supported way to query this value; if it
        # changes again the only consequence is slightly uglier help
        # output, which is preferable to subclassing private classes.
        if sys.version_info < (3, 8):
            try:
                width = int(os.environ['COLUMNS'])
            except (KeyError, ValueError):
                width = 80
        else:
            width = shutil.get_terminal_size().columns
        width -= 2

    wrapped = []
    for paragraph in text.split("\n\n"):
        wrapped.append(textwrap.fill(paragraph, width))
    return "\n\n".join(wrapped)


def _main():
    """Command-line entry point.

    Parses the arguments, stores the chosen options in the module-level
    option globals, opens the dump file (as bzip2 if it starts with the
    bzip2 magic bytes, as UTF-8 text otherwise) and runs the SAX parser
    over it with an _XMLHandler.  If counting was requested, a final
    summary line is written to stderr.
    """
    global _all_namespaces, _count, _output, _matched_pages, _print_matches

    parser = argparse.ArgumentParser(
        description=_fill_paragraphs(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        "dumpfile",
        help="input dump file, in xml or bzip2-compressed xml")
    parser.add_argument(
        "-a", "--all-namespaces",
        action="store_true",
        help=HELP_ALL_NAMESPACES)
    parser.add_argument(
        "-c", "--count",
        action="store_true",
        help=HELP_COUNT)
    parser.add_argument(
        "-m", "--print-matches",
        action="store_true",
        help=HELP_PRINT_MATCHES)
    parser.add_argument(
        "-o", "--output",
        default=sys.stdout,
        type=argparse.FileType("w", encoding="utf-8"),
        help=HELP_OUTPUT)
    options = parser.parse_args()

    _all_namespaces = options.all_namespaces
    if options.count:
        _count = 0
    else:
        _count = None
    _print_matches = options.print_matches
    _output = options.output

    _matched_pages = 0

    # Peek at the first three bytes to tell a bzip2 stream from plain xml.
    with open(options.dumpfile, 'rb') as probe:
        magic = probe.read(3)
    if magic != b'\x42\x5a\x68':
        f = open(options.dumpfile, 'r', encoding='utf-8')
    else:
        f = bz2.BZ2File(options.dumpfile)

    xml.sax.parse(f, _XMLHandler())

    if _count is None:
        return
    # Skip the summary when the count is a positive multiple of 1000,
    # since process_page() has already printed the identical line.
    if _count == 0 or _count % 1000 != 0:
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)


# Run the command-line interface only when executed as a script, not
# when the module is imported.
if __name__ == "__main__":
    _main()
#</nowiki>