Jump to content

User:OrphanBot/libPearle2.pl

From Wikipedia, the free encyclopedia

### IMPORTANT ###

# This code is released into the public domain.

### RECENT CHANGES ###

# 30 Nov 2005: Created, based off of the  12 Nov 2005 version of Pearle Wisebot
# 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
#              Simplified "limit" code, modified to take an optional parameter
#              Added "config" function as a clean interface to change internal parameters
#              Modified Wiki-access functions for use with the new "retry" function
#              Cleanup of boolean config vars to use standard Perl boolean conventions
# 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
#              Added support for proxy servers
#  8 Mar 2006: Added support for getting a user's contributions
#              Added support for retrieving logs
#              Separated out some common regex parts into variables
# 29 Mar 2006: Added protection against Unicode in URLs
#              Made thrown exceptions consistent
#              Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
# 17 May 2005: Improved log retrieval
# 12 Jul 2007: Added timestamp to information retrieved from logs

# Errors thrown by this package always begin with a three-digit number
#     4xx: HTTP client errors
#     505: Server error: HTTP version not supported
#     509: Server error: Bandwidth exceeded
#
#     900: Unspecified internal error.
#     901: Library not initialized.  You didn't call Pearle::init() before calling this function.
#     902: Parameter error.  You made a function call, but forgot a mandatory parameter, or provided an invalid one.
#
#     920: Unexpected response.  The MediaWiki site returned something unexpected.
#     921: Unexpected logout.  The MediaWiki site logged us out unexpectedly.
#     922: Edit conflict.  Someone edited the article while we were.
#     923: Deleted article conflict.  Someone deleted the article while we were editing.

package Pearle;

use strict;
use warnings;

use Time::HiRes;

use utf8;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;

# Standard regex parts
$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)';				# Match and capture a Wikipedia timestamp
$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d';						# Match a Wikipedia timestamp

#<a href="/w/index.php?title=User:Angel_dunn&action=edit" class="new" title="User:Angel dunn">
#<a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">
$Pearle::regex_pagelink = '<a href="[^"]*"(?: class="new"|) title="([^"]*)">';	# Match and capture any page
$Pearle::regex_redpagelink = '<a href="[^"]*" class="new" title="([^"]*)">';	# Match and capture nonexistant pages only
$Pearle::regex_bluepagelink = '<a href="[^"]*" title="([^"]*)">';				# Match and capture existing pages only
$Pearle::regex_pagelink_nc = '<a href="[^"]*"(?: class="new"|) title="[^"]*">';	# Match any page
$Pearle::regex_redpagelink_nc = '<a href="[^"]*" class="new" title="[^"]*">';	# Match nonexistant pages only
$Pearle::regex_bluepagelink_nc = '<a href="[^"]*" title="[^"]*">';				# Match existing pages only

# Standard MediaWiki namespaces
@Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk");

$Pearle::logfile = "";
$Pearle::_inited = 0;
$Pearle::username = "";
$Pearle::password = "";
$Pearle::speedLimit = 10;	# Seconds to wait by default when limit() is called
$Pearle::_speedMult = 1;	# Multiplier for default wait time if the wiki is being slow
$Pearle::roughMode = 0;		# Ignore most errors
$Pearle::nullOK = 0;		# Permit editing non-existent pages
$Pearle::sanityCheck = 0;	# Sanity checking on edits
$Pearle::silent = 0;		# Silent mode
$Pearle::quiet = 0;			# Quiet mode
$Pearle::logoutOK = 0;		# Permit editing while logged out
$Pearle::proxy = undef;		# Proxy to use

# This must be the first function from the library called
sub init
{
	$Pearle::username = $_[0] or die("902 No username provided!\n");
	$Pearle::password = $_[1] or die("902 No password provided!\n");
	$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");
	$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");
	$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2";
	

	$Pearle::ua = LWP::UserAgent->new(timeout => 300);
	$Pearle::ua->agent($Pearle::useragent);
	$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));
	$Pearle::ua->cookie_jar->load();

	$Pearle::roughMode = "no";

	# Hot pipes
	$| = 1;
	
	$Pearle::_inited = 1;
}

sub config
{
	my %params = @_;
	$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));
	$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));
	$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));
	$Pearle::silent = $params{silent} if(defined($params{silent}));
	$Pearle::quiet = $params{quiet} if(defined($params{quiet}));
	$Pearle::logfile = $params{logfile} if(defined($params{logfile}));
	$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));
	$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));
	
	if(exists($params{proxy}))
	{
		if(defined($params{proxy}))
		{
			myPrint("Proxying: $params{proxy}\n");
			myLog("Proxying: $params{proxy}\n");
			$Pearle::ua->proxy('http', $params{proxy});
			$Pearle::proxy = $params{proxy};
		}
		else
		{
			myPrint("Not proxying\n");
			myLog("Not proxying\n");
			$Pearle::ua->no_proxy();
			$Pearle::proxy = undef;
		}
	}
}

sub myLog
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

    open (LOG, ">>", $Pearle::logfile) 
	|| die "900 Could not append to log!";
    print LOG $_[0];
    close (LOG);
}

sub myPrint
{
	return if($Pearle::silent);
	return if($Pearle::quiet);
	
	print @_;
}

sub myErrPrint
{
	return if($Pearle::silent);
	return if($Pearle::quiet);
	
	print STDERR @_;
}

# Rate-limiting.  Can be sensibly run even if libPearle isn't initialized
sub limit
{
	my ($i);
	$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));
	$i = 10 if($i < 10);

	# Rate-limiting to avoid hosing the wiki server
	# Min 30 sec unmarked
	# Min 10 sec marked
	# May be raised by retry() if load is heavy

	### ATTENTION ###
	# Increasing the speed of the bot to faster than 1 edit every 10
	# seconds violates English Wikipedia rules as of April, 2005, and
	# will cause your bot to be banned.  So don't change $normalDelay
	# unless you know what you are doing.  Other sites may have
	# similar policies, and you are advised to check before using your
	# bot at the default speed.
	#################

	while ($i >= 0)
	{
		sleep (1);
		myErrPrint("Sleeping $i seconds...\r");
		$i--;
	}
	myErrPrint("                                   \r");
}

sub login 
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my $res = $Pearle::ua->post(
		"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submitlogin",
		Content => [
			wpName         => $Pearle::username,
			wpPassword     => $Pearle::password,
			wpRemember     => 1,
			wpLoginAttempt => 1
		]
	);

	if( 302 == $res->code )
	{
		myPrint("Logged in as $Pearle::username\n");
		myLog("Logged in as $Pearle::username\n");
		# This may or may not actually work
		$Pearle::ua->cookie_jar->save();
		return 1;
	}
	else
	{
		myPrint("Login failed\n");
		myPrint("Code: ".$res->code."\n");
		myLog("Login failed\n");
		return 0;
	}
}

sub logout {
        my $res = $Pearle::ua->post(
                "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogout",
        );

        return 1;
}

sub checkLogin
{
	my ($reply_text);
	$reply_text = $_[0];
	
	if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))
	{
		# We've lost our identity.
		myLog ("Wiki server is not recognizing me.\n");
		die ("921 Wiki server is not recognizing me.\n");
	}
}

# Make an HTTP request, performing basic error checking and handling.  Suitable for use with the "retry" function
sub httpRequest
{
	my ($request, $response, $attemptStartTime, $attemptEndTime);
	$request = $_[0];
	
	# Since not every server handles UTF-8 in URLs, and LWP doesn't escape them properly, escape every character > 255
	$request->uri(unicodeToUrl($request->uri()));
	
	$response = $Pearle::ua->request($request);

	# Monitor wiki server responsiveness
	$attemptStartTime = Time::HiRes::time();

	if ($response->is_success or $response->is_redirect)
	{
		return $response
	} 
	else 
	{
		myLog ("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
		myPrint("HTTP ERR (".$response->status_line.")\n".$response->content."\n");
		# 50X HTTP errors mean there is a problem connecting to the wiki server.  Can be remedied by waiting and trying again
		if (500 <= $response->code and 504 >= $response->code)
		{
			die("retry:".$response->status_line);
		}
		else
		{
			# Unhandled HTTP response.  Waiting probably won't fix it
			die ($response->status_line."\n");
		}
	}
	# Monitor wiki server responsiveness
	$attemptEndTime = Time::HiRes::time();

	if($request->method() eq "POST")
	{
		if (($attemptEndTime - $attemptStartTime) > 20)
		{
			$Pearle::_speedMult = 60;

			myPrint("Wikipedia is very slow.  Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
			myLog("Wikipedia is very slow.  Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}

		# If the response time is between 10 and 20 seconds...
		elsif (($attemptEndTime - $attemptStartTime) > 10)
		{
			$Pearle::_speedMult = 6;

			myPrint("Wikipedia is somewhat slow.  Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
			myLog("Wikipedia is somewhat slow.  Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}

		# If the response time is less than 10 seconds...
		else
		{
			if ($Pearle::_speedMult != 1)
			{
				$Pearle::_speedMult = 1;

				myPrint( "Returning to normal minimum wait time.\n");
				myLog("Returning to normal minimum wait time.\n");
			}
		}
	}
}

# Check out a page for editing.
sub getPage
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $request, $response, $reply, $text, $text2,
	$editTime, $startTime, $attemptStartTime, $attemptFinishTime,
	$token, $targetSafe);

	$target = $_[0];

	if ($target =~ m/^\s*$/)
	{
		myLog("getPage: Null target.");
		die("902 getPage: Null target.");
	}

	$targetSafe = $target;
	$targetSafe =~ s/\&/%26/g;
	$targetSafe =~ s/\+/%2B/g;

	# Create a request-object
	myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");
	myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n");
	$request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit");
	$response = startRetry(\&httpRequest, $request);

	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);

	# Check for blocking
	if($reply =~ /<h1 class="firstHeading">User is blocked<\/h1>/)
	{
		myLog("Blocked\n");
		die("900 Blocked");
	}
	
	$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
	$text = $1;

	$reply =~ m/value="(\d+)" name="wpEdittime"/;
	$editTime = $1;

	# Added 22 Aug 2005 to correctly handle articles that have
	# been undeleted
	$reply =~ m/value="(\d+)" name="wpStarttime"/;
	$startTime = $1;

	# Added 9 Mar 2005 after recent software change.
	$reply =~ m/value="([^"]+)" name="wpEditToken"/;
	$token = $1;
	###

	if (($text =~ m/^\s*$/)	and !$Pearle::nullOK)
	{
		myLog ("getPage($target): Null text!\n");
		myLog ("\n---\n$reply\n---\n");
		if ($Pearle::roughMode)
		{
			return;
		}
		else
		{
			die ("920 getPage($target): Null text!\n");
		}
	}

	if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK)
	{
		myLog ("getPage($target): Null time!\n");
		myLog("\n---\n$reply\n---\n");
		die ("920 getPage($target): Null time!\n");
	}

	if (($text =~ m/>/) or ($text =~ m/</))
	{
		myPrint($text);
		myLog("\n---\n$text\n---\n");
		myLog ("getPage($target): Bad text suck!\n");
		die ("920 getPage($target): Bad text suck!\n");
	}

	# Change ( " -> " ) etc
	# This function is from HTML::Entities.
	decode_entities($text);

	# This may or may not actually work
	$Pearle::ua->cookie_jar->save();

	return ($text, $editTime, $startTime, $token);
}


sub postPage
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($request, $response, $pageName, $textToPost, $summaryEntry,
	$editTime, $startTime, $actual, $expected, $date, $editToken,
	$minor, $pageNameSafe);

	$pageName = $_[0];
	$editTime = $_[1];
	$startTime = $_[2];
	$editToken = $_[3];
	$textToPost = $_[4];
	$summaryEntry = $_[5]; # Max 200 chars!
	$minor = $_[6];

	$summaryEntry = substr($summaryEntry, 0, 200);

	if ($pageName eq "")
	{
		myLog ("postPage(): Empty pageName.\n"); 
		die ("902 postPage(): Empty pageName.\n"); 
	}
	
	if(!defined($minor))
	{
		die "902 postPage(): Not enough parameters!\n";
	}

	if ($summaryEntry eq "")
	{
		$summaryEntry = "Automated editing.";
	}

	$pageNameSafe = $pageName;
	$pageNameSafe =~ s/\&/%26/g;
	$pageNameSafe =~ s/\+/%2B/g;

	if ($minor eq "yes")
	{
		$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",
		[wpTextbox1 => $textToPost,
		wpSummary => $summaryEntry,
		wpSave => "Save page",
		wpMinoredit => "on",
		wpEditToken => $editToken,
		wpStarttime => $startTime,
		wpEdittime => $editTime];
		# Optional: wpWatchthis
	}
	else
	{
		$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit",
		[wpTextbox1 => $textToPost,
		wpSummary => $summaryEntry,
		wpSave => "Save page",
		wpEditToken => $editToken,
		wpStarttime => $startTime,
		wpEdittime => $editTime];
		# Optional: wpWatchthis, wpMinoredit
	}

	# ---
	## If posts are failing, you can uncomment the below to see what
	## HTTP request is being made.
	# myLog($request->as_string());
	# print $request->as_string();	$::speedLimit = 60 * 10;
	# print $::ua->request($request)->as_string;
	# ---

	myLog("POSTing...");
	myPrint("POSTing...");
	# Pass request to the user agent and get a response back
	$response = startRetry(\&httpRequest, $request);
	myLog("POSTed.\n");
	myPrint("POSTed.\n");


	if ($response->content =~ m/Please confirm that really want to recreate this article./)
	{
		myLog ($response->content."\n");
		die ("923 Deleted article conflict! See log!");
	}


	# Check the outcome of the response
	$response->code;
	if ($response->code != 302 and $response->code  != 200)
	{
		myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
		myLog ($request->as_string());
		myLog ("EXPECTED: 302'\n");
		myLog ("  ACTUAL: '" . $response->status_line . "'\n");

		if ($Pearle::roughMode eq "yes")
		{
			return();
		}
		else
		{
			die ("920 postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
		}
	}

	$expected = "http://en.wikipedia.org/wiki/${pageName}";
	$expected = Pearle::urlEncode($expected);

	$actual = $response->headers->header("Location");


	if (($expected ne $actual) and ($Pearle::roughMode ne "yes") 
	    and !(($actual eq "") and ($response->code == 200)))
	{
		myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
		myLog ("EXPECTED: '${expected}'\n");
		myLog ("  ACTUAL: '${actual}'\n");
		die ("920 postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
	}


	if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/)
	{
		myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
		die ("922 Edit conflict on '$pageName' at '$editTime'!\n");
	}
	
	if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i)	# Check for accidental creation of a talkpage without a mainpage.  Only works with bots using the "monobook" skin.
	{
		# Monobook:<li id="ca-nstab-main" class="new"><a href="/w/index.php?title=Kjsahfjrf&action=edit">Article</a></li> 
		# Classic: <br /><a href="/w/index.php?title=Kjsahfjrf&action=edit" class="new" title="Kjsahfjrf">View article</a>
		if($response->content =~ /<li id="ca-nstab-[^"]" class="new">/)
		{
			myLog ("postPage(${pageName}) - Talkpage without article!\n");
			die ("920 postPage(${pageName}) - Talkpage without article!\n");
		}
	}
	
	if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/)	# Check for user talkpage for non-existant user
	{
		if($response->content !~ /User contributions/)
		{
			myLog ("postPge(${pageName}) - User talkpage for non-existant user!\n");
			die ("920 postPge(${pageName}) - User talkpage for non-existant user!\n");
		}
	}

	$Pearle::ua->cookie_jar->save();
	return ($response->content);
}


# Get a list of the names of articles in a given category.
sub getCategoryArticles
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $request, $response, $reply, $articles, $article,
	@articles, 	$targetSpace, $offset, $numberOfArticles, $url,
	@moreArticles);


	$target = $_[0];
	$offset = $_[1];

	# Need both _ and spaces for precise matching later
	$target =~ s/ /_/g;
	$targetSpace = $target;
	$targetSpace =~ s/_/ /g;

	unless ($target =~ m/^Category:/)
	{
		myLog ("getCategoryArticles(): Are you sure '$target' is a category?\n");
		die ("902 getCategoryArticles(): Are you sure '$target' is a category?\n");
	}

	if ($offset eq "")
	{
		$url = "http://en.wikipedia.org/wiki/${target}";
	}
	else
	{
		$url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}";
	}

	# Create a request-object
	if ($offset eq "")
	{
		myPrint("GET ${url}\n");
	}
	myLog("GET ${url}\n");
	$request = HTTP::Request->new(GET => "${url}");
	$response = startRetry(\&httpRequest, $request);

	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);

	$articles = $reply;
	$articles =~ s%^.*?<h2>Articles in category.*?</h2>%%s;
	$articles =~ s%<div class="printfooter">.*?$%%s;
	@articles = $articles =~ m%<li><a href="/wiki/(.*?)" title=%sg;

	if ($reply =~ m/<a\s+href=\"\/w\/index.php\?title=${target}\&from=(.*?)\"\s+title=\"${targetSpace}\">next 200<\/a>/s)
	{
		sleep (1); # Throttle GETs
		@moreArticles = getCategoryArticles($target, $1);
		@articles = (@articles, @moreArticles);
	}

	$Pearle::ua->cookie_jar->save();

	$numberOfArticles = @articles;

	if ($offset eq "")
	{
		myPrint("Got $numberOfArticles articles.\n");
		myLog ("Got $numberOfArticles articles.\n");
	}
	return decodeArray(@articles);
}

sub getCategoryImages
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $from, $request, $response, $reply, $images, @images,
	$image, %imagesHash);


	$target = $_[0];
	$from = $_[1];

	unless ($target =~ m/^Category:/)
	{
		myLog ("getCategoryImages(): Are you sure '$target' is a category?\n");
		die ("902 getCategoryImages(): Are you sure '$target' is a category?\n");
	}

	# Create a request-object
	if(!defined($from))	# Default: Start at the beginning of a category
	{
		myPrint("GET http://en.wikipedia.org/wiki/${target}\n");
		myLog("GET http://en.wikipedia.org/wiki/${target}\n");
		$request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");
	}
	else	# Start somewhere middle-ish
	{
		myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n");
		myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n");
		$request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}\&from=$from");
	}
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);
#	unless ($reply =~ m%<a href="/wiki/User_talk:$Pearle::username">My talk</a>%)
#	{
#		# We've lost our identity.
#		myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n");
#		die ("Wiki server is not recognizing me (2).\n");
#	}

	$images = $reply;
	$images =~ s/^.*?<table class="gallery"//s;
	$images =~ s/<div class="printfooter">.*?$//s;
	@images = $images =~ m/<a\s+href="\/wiki\/(.*?)"\s+title=\"Image:/g;

	@images = grep {$_ =~ /^Image:/} @images;

	if($images =~ /&from=([^"]+)" title="Category:[^"]*">next 200/)
	{
		print "More: $1\n";
		@images = (@images, getCategoryImages($target, $1));
	}

	# Uniqify to prevent duplicates
	@images = uniquify(@images);

	$Pearle::ua->cookie_jar->save();
	return decodeArray(@images);
}



sub getSubcategories
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $request, $response, $reply, $subcats, $subcat,
	@subcats, $attemptStartTime, $attemptFinishTime);

	$target = $_[0];

	unless ($target =~ m/^Category:/)
	{
		myLog ("getSubcategories(): Are you sure '$target' is a category?\n");
		die ("902 getSubcategories(): Are you sure '$target' is a category?\n");
	}

	# Create a request-object
	myPrint("GET http://en.wikipedia.org/wiki/${target}\n");
	myLog("GET http://en.wikipedia.org/wiki/${target}\n");
	$request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}");
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);

	$subcats = $reply;

	if ($subcats =~ m%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category.*?</h2>.*?$%s)
	{
		$subcats =~ s%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category.*?</h2>.*?$%$1%s;
	}
	else
	{
		return ();
	}

	@subcats = $subcats =~ m%<li><a href="/wiki/(.*?)" title=%sg;

	$Pearle::ua->cookie_jar->save();
	return decodeArray(@subcats);
}

# Get up to $max most recent articles edited by a user
sub getUserArticles
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($url, $request, $response, $reply, @contribs,
	    $target, $namespace, $max, $offset);
	
	$target = $_[0];
	$max = $_[1];
	$offset = $_[2];
	$namespace = namespaceToNumber($_[3]);

	# Create a request-object
	if(defined($namespace))
	{
		$url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";
	}
	else
	{
		$url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";
	}

	myPrint("GET $url\n");
	myLog("GET $url\n");
	$request = HTTP::Request->new(GET => "$url");
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);
		
	# Extract the contributions
	# <li>23:18, 6 March 2006 (<a href="/w/index.php?title=User_talk:OrphanBot&action=history" title="User talk:OrphanBot">
	while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g)
	{
		push @contribs, $1;
	}
	
	# Remove duplicates	
#	@contribs = uniquify(@contribs);
	return @contribs;
}

# Gets a list of (articles, actor, summary) tuples from the specified log (upload, delete, move, protect)
sub getLogArticles
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($url, $request, $response, $reply, @articles,
	    $log, $max, $offset, $user);
	
	
	$log = $_[0];
	$max = $_[1] || 50;
	$offset = $_[2] || 0;
	$user = $_[3] || '';

	# Create a request-object
	# http://en.wikipedia.org/w/index.php?title=Special:Log&type=upload&user=&page=&limit=2000&offset=0
	$url = "http://en.wikipedia.org/w/index.php?title=Special%3ALog&limit=${max}&offset=${offset}&user=${user}&type=${log}";

	myPrint("GET $url\n");
	myLog("GET $url\n");
	$request = HTTP::Request->new(GET => "$url");
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->content;

	# This detects whether or not we're logged in.
	checkLogin($reply);
		
	# Extract the articles
	#<li>19:55, 7 March 2006 <a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">Jimbo Wales</a> deleted "<a href="/w/index.php?title=Image:Justinsfriends.jpg&action=edit" class="new" title="Image:Justinsfriends.jpg">Image:Justinsfriends.jpg</a>"  <span class='comment'>(blatant copyvio)</span> </li>
	#<li>19:54, 7 March 2006 <a href="/wiki/User:MrD9" title="User:MrD9">MrD9</a> moved <a href="/w/index.php?title=Statsoft&redirect=no" title="Statsoft">Statsoft</a> to <a href="/wiki/StatSoft" title="StatSoft">StatSoft</a>  (<a href="/w/index.php?title=Special:Movepage&wpOldTitle=StatSoft&wpNewTitle=Statsoft&wpReason=revert&wpMovetalk=0" title="Special:Movepage">revert</a>)</li>
	#<li>19:53, 7 March 2006 <a href="/w/index.php?title=User:Biederman&action=edit" class="new" title="User:Biederman">Biederman</a> uploaded "<a href="/wiki/Image:Rockingham_Raymond_NH.PNG" title="Image:Rockingham Raymond NH.PNG">Image:Rockingham Raymond NH.PNG</a>"  <span class='comment'>(Changed Image:Rockingham_Portsmouth_NH.PNG to highlight Raymond )</span> </li>
	#<li>19:31, 7 March 2006 <a href="/wiki/User:Francs2000" title="User:Francs2000">Francs2000</a> protected <a href="/wiki/Manoeuvre.org" title="Manoeuvre.org">Manoeuvre.org</a>  <span class='comment'>({{deletedpage}} [edit=sysop:move=sysop])</span> </li>
	#<li>19:30, 7 March 2006 <a href="/wiki/User:Tony_Sidaway" title="User:Tony Sidaway">Tony Sidaway</a> unprotected <a href="/wiki/Will_McWhinney" title="Will McWhinney">Will McWhinney</a>  <span class='comment'>(This looks like the protection that time forgot.)</span> </li>


#	while($reply =~ /<li>$Pearle::regex_timestamp_nc ${Pearle::regex_pagelink}.*?<\/a> (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*<span class='comment'>(.*)<\/span>|)/g)
	while($reply =~ /<li>($Pearle::regex_timestamp_nc) ${Pearle::regex_pagelink}.*?<\/a> \(${Pearle::regex_pagelink_nc}Talk<\/a> \| ${Pearle::regex_pagelink_nc}contribs<\/a>\) (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*<span class="comment">(.*)<\/span>|)/g)
	{
		my $summary = $3 || '';
		push @articles, [$3, $2, $summary, $1];
	}
	@articles = uniquify_ref1(@articles);
	return @articles;
}

# Use the Special:Export interface to get the wikitext of one or more articles
sub Export
{
	my ($request, $response, $reply, $articles);
	
	$articles = join "\n", @_;
	
	$request = POST "http://en.wikipedia.org/w/index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->content;

	return $reply;
}

# Do a null edit to an article
sub nullEdit
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($text, $articleName, $comment, $editTime, $startTime, $token);

	$articleName = $_[0];

	myPrint("nullEdit($articleName)\n");
	myLog ("nullEdit($articleName)\n");

	($text, $editTime, $startTime, $token) = getPage($articleName);
	unless ($text eq "")
	{
		postPage ($articleName, $editTime, $startTime, $token, $text, "null edit");
	}
}

# Get the history of an article and parse the first 500 entries into a list of [link day month year] lists
sub parseHistory
{
    my ($pageName, $html, @lines, $line, $date, $hour, $minute, $day, $month, $year,
	$htmlCopy, $link, $user, @result);

    $pageName = $_[0];
	$pageName = escapeUrl($pageName);
	
    $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500");

    $htmlCopy = $html;

    $html =~ s%^.*?<ul id="pagehistory">%%s;
    $html =~ s%(.*?)</ul>.*$%$1%s;

    @lines = split ("</li>", $html);
    foreach $line (@lines)
    {
		$line =~ s/\n/ /g;

		if ($line =~ m/^\s*$/)
		{
	    	next;
		}
		($user) = $line =~ /<span class='history-user'><a href=[^>]*>([^<]*)/;
		$line =~ s/<span class='history-user'>.*?$//;
		$line =~ s/^.*?Select a newer version for comparison//;
		$line =~ s/^.*?Select a older version for comparison//;
		$line =~ s/^.*?name="diff" \/>//;
		
		$line =~ m%<a href="(.*?)" title="(.*?)">$Pearle::regex_timestamp</a>%;
		$link = $1;
		$hour = $3;
		$minute = $4;
		$day = $5;
		$month = $6;
		$year = $7;

		push @result, [$link, $day, $month, $year, $user];
    }
    
    return (@result);
}


sub getURL #($target)
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
    # Read throttle!
    sleep (1);

    my ($request, $response, $reply, $url);
    
    $url = $_[0];

    # Create a request-object
    myPrint("GET ${url}\n");
    myLog("GET ${url}\n");
    $request = HTTP::Request->new(GET => "${url}");
    $response = startRetry(\&httpRequest, $request);

	$reply = $response->content;
	
	# This may or may not actually work
	$Pearle::ua->cookie_jar->save();

	return ($reply);
}


# Retries a given function repeatedly, with an exponential backoff rate
# The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried
sub startRetry
{
	my ($call_fn, @args) = @_;
	return retry($Pearle::speedLimit, $call_fn, @args);
}

sub retry
{
	my ($call_fn, @args, $delay, @result, $result);
	
	($delay, $call_fn, @args) = @_;
	
	if(wantarray())
	{
		@result = eval{ $call_fn->(@args) };
		if($@ =~ /^retry:/i)
		{
			limit($delay);
			@result = retry($delay * 2, $call_fn, @args);
		}
		elsif($@)
		{
			die;
		}
		return @result;
	}
	else
	{
		$result = eval{ &{$call_fn}(@args) };
		if($@ =~ /^retry:/i)
		{
			limit($delay);
			$result = retry($delay * 2, $call_fn, @args);
		}
		elsif($@)
		{
			die;
		}
		return $result;
	}
}


sub namespaceToNumber
{
	my $namespace = $_[0];
	my $i = 0;
	my $name;
	if(defined($namespace))
	{
		foreach $name (@Pearle::namespaces)
		{
			return $i if(lc($name) eq lc($namespace));
			$i++;
		}
	}
	else
	{
		return undef;
	}
}

sub numberToNamespace
{
	my $i = shift;
	if(defined($i))
	{
		return $Pearle::namespaces[$i];
	}
	else
	{
		return undef;
	}
}


# Translate from HTTP URL encoding to the native character set.
sub urlDecode
{
	my ($input);

	$input = $_[0];

	$input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

	return ($input);
}

# Basic escaping of special characters in a URL
sub escapeUrl
{
	my $input = shift;
	$input =~ s/%/%25/g;
	$input =~ s/&/%26/g;
	
	$input = unicodeToUrl($input);
	
	return $input;
}

# URL-escape any high-unicode chars in a string
sub unicodeToUrl
{
	my ($char, $input, $output);;
	
	$input = $_[0];

	foreach $char (split("",$input))
	{
		if(ord($char) > 255)
		{
			$output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF));
			# %HH%LL where HHLL is the hex code of $char
		}
		else
		{
			$output .= $char;
		}
	}
	return $output;
}

# Translate from the native character set to the Wikipedia HTTP URL encoding.
sub urlEncode
{
	my ($char, $input, $output);

	$input = $_[0];

	foreach $char (split("",$input))
	{
		#	if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/)

		# The below exclusions should conform to Wikipedia practice
		# (possibly non-standard)
		if ($char =~ m/[a-z|A-Z|0-9|\-_\.\/:]/)
		{
			$output .= $char;
		}
		elsif ($char eq " ")
		{
			$output .= "_";
		}
		else
		{
			if(ord($char) > 255)
			{
				$output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF));
				# %HH%LL where HHLL is the hex code of $char
			}
			else
			{
				$output .= uc(sprintf("%%%x", ord($char)));
				# %HH where HH is the hex code of $char
			}
		}
	}

	return ($output);
}

sub decodeArray
{
	return map {urlDecode($_)} @_;
}

# Remove duplicates from a list
sub uniquify
{
	my @list = @_;
	@list = sort @list;
	my $last = undef;
	my @new_list;
	my $item;
	
	foreach $item (@list)
	{
		push @new_list, $item if(!defined($last) or ($item ne $last));
		$last = $item;
	}
	return @new_list;
}

# Remove duplicates from a list of array references, grouping on the first subelement
sub uniquify_ref1
{
	my @list = @_;
	@list = sort {$a->[0] cmp $b->[0]} @list;
	my $last = undef;
	my @new_list;
	my $item;

	foreach $item (@list)
	{
		push @new_list, $item if(!defined($last) or ($item->[0] ne $last));
		$last = $item->[0];
	}
	return @new_list;
}

1;