User:PhotoCatBot/Src/StaleReqphotoBot

Source code for StaleReqphotoBot, which identifies {{reqphoto}} requests that appear to be "stale" (the article page already has one or more images) and adds the corresponding articles to a category of articles whose reqphoto template may need to be removed.

#! /usr/bin/python

# StaleReqphotoBot
#
# Examine each article that transcludes {{reqphoto}}.  If the
# main article page has at least one image, add the article
# to Category:Articles which may no longer need images.
#
# Do not revisit any article more often than once every six
# months.  Skip articles with {{reqphoto|of=...}}.  Skip
# articles whose titles begin with "National Register of
# Historic Places listing".

import wikipedia, catlib, pagegenerators
import sqlite3
import time, sys
import re
import wikitemplate
import socket
from datetime import datetime, timedelta

startCatName = 'Category:Wikipedia requested photographs'
startCatAfter = None
hasImageCatName = 'Category:Articles which may no longer need images'
editComment = '[[User:PhotoCatBot|PhotoCatBot]] thinks this article may no longer need a photo request.  Please check and update the talk page!'

def main():
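    # Walk the requested-photographs category tree and check each
    # member page for a stale photo request.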
    diary = initialize_diary()

    site = wikipedia.getSite()

    # Find articles transcluding {{reqphoto}}
    photoreq_cat = catlib.Category(None, startCatName)
    photoreq_pages = pagegenerators.CategorizedPageGenerator(photoreq_cat, recurse = True, start = startCatAfter)
    for p in photoreq_pages:
        try:
            update_stale_reqphotos(diary, p)
        except (wikipedia.Error, socket.timeout):
            wikipedia.output("%s raised on %s" % (sys.exc_info(), p.title()))

def update_stale_reqphotos(diary, page):
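    # Examine one article/talk page pair.  If the photo request looks
    # stale (the article already has a suitable image and no exemption
    # applies), add the talk page to hasImageCatName.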
    if page.isTalkPage():
        talk = page
        article = page.toggleTalkPage()
    else:
        article = page
        talk = page.toggleTalkPage()

    # Skip NRHP Listing articles per doncram.
    if article.title().startswith('National Register of Historic Places listing'):
        wikipedia.output("%s: skipping" % article.title())
        return

    # Skip this page if we have modified it in the last 6 months.
    if recently_updated(diary, talk):
        wikipedia.output("%s was updated within 6 months" % talk.title())
        return

    # If the page has a {{reqphoto}} with the "of=" parameter,
    # assume it is a very specific photo request and ignore it
    # even if the page has images.  This is the way to stop the
    # bot from inappropriately re-adding a page to the category
    # 'Articles which may no longer need images'.
    reqphotos = find_reqphotos_on(talk)
    if reqphotos:
        reqphotos_have_of = any(param.startswith("of=")
                                for req in reqphotos
                                for param in req[1])
        if reqphotos_have_of:
            wikipedia.output("%s has {{reqphoto|of=}}, skipping" % article.title())
            return

    # If the article has an infobox *and* any infobox
    # does not have an image, skip it -- the image request
    # is assumed to still be legitimate in this case.
    # Suggestion by {{user|Emperor}}.
    infoboxes = find_infoboxes(article)
    if infoboxes:
        infoboxes_lacking_image = [box for box in infoboxes if infobox_lacks_image(box)]
        if infoboxes_lacking_image:
            wikipedia.output("skipping %s: {{%s}} lacks an image" % (article.title(), infoboxes_lacking_image[0][0]))
            return

    # If this article appears to contain images, add it to
    # 'Articles which may no longer need images'.
    if has_images(article):
        text = talk.get()
        cats = talk.categories()
        hasImageCat = catlib.Category(None, hasImageCatName, sortKey = article.title())

        if hasImageCat in cats:
            wikipedia.output("%s already in %s, skipping" % (talk.title(), hasImageCatName))
            return
        else:
            newtext = wikipedia.replaceCategoryLinks(text, cats + [hasImageCat])
            if text != newtext:
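                # The actual save (talk.put) and the diary update are
                # commented out below, so the bot currently just shows
                # the proposed diff (a dry run).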
                try:
                    #talk.put(newtext, editComment)
                    wikipedia.showDiff(text, newtext)
                except:
                    wikipedia.output("could not save %s: %s" % (talk.title(), sys.exc_info()))
                #update_modification_time(diary, talk)

def find_reqphotos_on(page):
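    # Return every {{reqphoto}} transclusion on the page as a
    # (name, parameter list) tuple.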
    reqphotos = [tmpl for tmpl in page.templatesWithParams() if tmpl[0] == 'Reqphoto']
    return reqphotos

def find_infoboxes(page):
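    # Return every template on the page whose name begins with
    # 'Infobox', as (name, parameter list) tuples.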
    infoboxes = [tmpl for tmpl in page.templatesWithParams() if tmpl[0].startswith('Infobox')]
    return infoboxes

def nonempty_image_param(param):
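    # Match infobox parameters of the form 'image = Something.jpg'
    # (or .png), i.e. the infobox already has a raster image set.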
    return re.match(r'image\s*=.*\.(jpg|png)', param, re.I | re.M)

def infobox_lacks_image(template):
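    # True when none of the infobox's parameters set an image.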
    parameters = template[1]
    return not any(nonempty_image_param(p) for p in parameters)

# Check whether the page includes a JPG, JPEG, GIF or PNG image.
# Skip .SVG because it is so often used for maps, logos, icons,
# placeholders and other small art that is not intended by the reqphoto
# template.
def has_images(page):
    try:
        images = page.imagelinks()
    except:
        wikipedia.output("%s raised on %s" % (sys.exc_info(), page.title()))
        return None

    return any(re.match(r'.*\.(jpg|jpeg|gif|png)$', img.title(), re.I) for img in images)

def initialize_diary():
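    # Open (or create) the local SQLite "diary" recording when the
    # bot last modified each talk page.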
    db = sqlite3.connect('StaleReqphotoBot.sqlite3', detect_types = sqlite3.PARSE_DECLTYPES)
    c = db.cursor()
    c.execute("""CREATE TABLE IF NOT EXISTS update_times (
                   title       TEXT      PRIMARY KEY,
                   update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                 )""")
    db.commit()
    c.close()
    return db

def update_modification_time(db, page):
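    # Record the current time as the page's last update time in the
    # diary (the update_time column defaults to CURRENT_TIMESTAMP).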
    c = db.cursor()
    args = (page.title(), )
    c.execute('INSERT OR REPLACE INTO update_times (title) VALUES (?)', args)
    db.commit()
    c.close()

def recently_updated(db, page):
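    # True if the diary says the page was updated within the last
    # 180 days (roughly six months).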
    c = db.cursor()
    args = (page.title(), )
    c.execute('SELECT update_time FROM update_times WHERE title = ?', args)
    r = c.fetchone()
    c.close()
    if r:
        expire_time = r[0] + timedelta(days = 180)
        return datetime.now() < expire_time
    else:
        return False

def close_diary(db):
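    # Close the diary database.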
    db.close()

if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()