# Utilisateur:Darkoneko/ImageLinkErasor.py
# -*- coding: utf-8 -*-
# this script deletes any calls to deleted images. Image list is fetched from the deletion log. 
# Distributed under the terms of the PSF license.

# Auteurs : Dake (2005) ; Darkoneko (aout 2006), Bayo (dec 2006), Phe (janvier 2007), Louperivois (mai 2007)
#(v2) refonte pour utiliser /w/api.php, fusion avec le script listant les images : fr:user:Darkoneko (avril 2008 + )
#(v3) refonte pour utiliser directement la classe de replace.py et les nouveaux generateurs de pywikipediabot (oct 2010 + )
#Version en cours : 3.2 - 24/10/2010
 
import pagegenerators, wikipedia as pywikibot
import config, sys, urllib, replace
from xml.dom.minidom import parse, parseString

#========================================== PARAMETERS ========================
# Namespaces in which dead media links may be removed.
# To run tests, create a user subpage and enable namespace 2 below.
allowedNamespaces = [
    # '2',  # disabled by default; enable for tests on a user subpage
    '0',    # (main)
    '10',   # template
    '12',   # help
    '14',   # category
    '100',  # portal
    '102',  # project
    '104',  # reference
]
#===============================================================================


#phe janvier 2007
def build_regex_from_title(title):
    """Turn a page title into a regex fragment matching that title in wikitext.

    - Every regex metacharacter is backslash-escaped so the title is matched
      literally (titles may legitimately contain '+', '?', '$', '^', ...).
    - Each space becomes '[ _]?' so a space, an underscore or nothing matches.
    - The first character is matched in either case.

    Returns an empty string for an empty title.
    """
    if not title:
        return u''

    metachars = u'\\.^$*+?()[]{}|'

    def quote(ch):
        # Regex fragment matching the single title character ch.
        if ch == u' ':
            return u'[ _]?'
        if ch in metachars:
            return u'\\' + ch
        return ch

    first = title[0]
    if first.lower() != first.upper():
        # Cased letter: accept both cases for the leading character only.
        head = u'[' + first.lower() + first.upper() + u']'
    else:
        # Uncased character: quote it like any other one, so an escaped
        # metacharacter is never split from its backslash.
        head = quote(first)

    return head + u''.join([quote(ch) for ch in title[1:]])


def create_regexes(img):
    """Build the replacements that remove every use of a deleted image.

    img is the image title without its namespace prefix.

    Returns a list of (pattern, replacement) tuples. The order matters: the
    patterns that remove the image itself come first, followed by the ones
    that clean up what the removal may leave behind (empty <gallery> or
    <center> tags, sound templates whose file parameter is now empty).
    """
    regexed_image = build_regex_from_title(img)

    # Namespace prefix, tolerant of stray blanks as in "[[ Image : foo.gif]]",
    # which show up often.
    regexed_namespace = u'[ _]*(?:image|m[ée]dia|fi(?:chier|le))[ _]*:[ _]*'

    # Links such as [[ Image : foo.gif | caption [[optional link]] ]]
    regexp = r'(?i)\[\[%s%s.*?(\[\[.*?\]\].*?)*\]\]' % (regexed_namespace, regexed_image)

    # Lines such as "Image:foo.gif|caption" found inside <gallery> tags.
    # The pattern consumes both surrounding newlines, so one newline is put
    # back by the replacement.
    regexpTagNoLinkGallery = r'(?i)\n%s%s\s*(?:\|[^\n]*)*\n' % (regexed_namespace, regexed_image)
    replaceTagNoLinkGallery = u'\n'

    # Empty <gallery> and empty <center> tags (the latter are regularly found
    # around images).
    regexpEmptyGallery = r'(?i)<gallery>\s*</gallery>'
    regexpEmptyCenter = r'(?i)<center>\s*</center>'

    # Image passed as a template parameter. Possible forms:
    # |Image:foo.jpg ; |foo.jpg ; |img=Image:foo.jpg ; |img=foo.jpg
    # The leading "|" or "=" is kept by the replacement.
    regexpNoTag = r'(?P<beforeZone>\||=)\s*(?:%s|)%s\s*' % (regexed_namespace, regexed_image)
    replaceNoTag = r'\g<beforeZone>'

    # {{son}} and {{listen}} templates whose 2nd parameter (the file name)
    # is now empty.
    regexpSound = r'{{\s*([Ss]on|[Ll]isten)[\n\s]*\|[^\|}]+\|[\n\s]*}}'

    return [
        (regexp, u''),
        (regexpTagNoLinkGallery, replaceTagNoLinkGallery),
        (regexpNoTag, replaceNoTag),
        (regexpEmptyGallery, u''),
        (regexpEmptyCenter, u''),
        (regexpSound, u''),
    ]


def get_related_pages(nom_image):
    """Return the titles of the pages that use the given image.

    nom_image is the full image title, namespace prefix included. Only pages
    in the namespaces listed in allowedNamespaces are requested.

    The request asks for up to 500 results explicitly; without iulimit the
    API falls back to its small default and the list is silently truncated.
    No continuation is followed, so at most one batch is returned.
    """
    url = u"/w/api.php?format=xml&action=query&list=imageusage&iulimit=500&iutitle=%s&iunamespace=%s" % (
        urllib.quote(nom_image.encode("utf-8")),
        "|".join(allowedNamespaces),
    )
    res = pywikibot.getSite().getUrl(url)
    xm = parseString(res.encode("utf-8"))
    # Each <iu> element describes one page using the image.
    return [node.getAttribute('title')
            for node in xm.documentElement.getElementsByTagName("iu")]


def image_exists(nom_image):
    """Tell whether the image is still available.

    Returns True when the first <page> element of the API reply carries a
    non-empty 'imagerepository' attribute, i.e. the image was restored or
    exists on commons. Returns False when that attribute is empty or absent.

    A reply without any <page> element is also reported as True, so that the
    caller skips the image instead of removing links on an inconclusive
    answer.
    """
    url = u"/w/api.php?format=xml&action=query&prop=imageinfo&titles=%s" % urllib.quote(nom_image.encode("utf-8"))
    res = pywikibot.getSite().getUrl(url)
    xm = parseString(res.encode("utf-8"))
    pages = xm.documentElement.getElementsByTagName("page")
    if not pages:
        return True
    return bool(pages[0].getAttribute('imagerepository'))

def get_deleted_images_list():
    """Return the titles of the images found in the latest deletion log entries.

    The log query is not restricted to namespace 6, so the entries are
    filtered here on their 'ns' attribute.
    """
    logUrl = "/w/api.php?format=xml&action=query&letype=delete&list=logevents&leprop=title&lelimit=500"
    raw_reply = pywikibot.getSite().getUrl(logUrl)
    document = parseString(raw_reply.encode("utf-8"))
    entries = document.documentElement.getElementsByTagName("item")
    return [entry.getAttribute('title')
            for entry in entries
            if entry.getAttribute('ns') == '6']



# Main script: for every recently deleted image that is really gone, remove
# the calls to it from the pages that still use it.
listInput = get_deleted_images_list()

pywikibot.output(u"%s images were found in the last 500 deletions. " % len(listInput))

for image in listInput:
    nom_image = urllib.unquote(image.strip())

    if image_exists(nom_image):
        continue  # the image was either restored or exists on commons.

    relatedPagesList = get_related_pages(nom_image)
    if not relatedPagesList:
        continue  # no links to that image. go to next one.
    nb_related_pages = len(relatedPagesList)

    pywikibot.output(u"%s lien(s) trouvé(s) vers %s" % (nb_related_pages, nom_image))

    # Strip the namespace prefix by splitting on the first colon, instead of
    # assuming a fixed prefix length.
    bare_title = nom_image.split(u':', 1)[-1]
    if not bare_title:
        continue  # nothing left to build a pattern from.

    replaceList = create_regexes(bare_title)
    pywikibot.setAction(u"Robot: retrait lien vers média effacé (%s) + ménage éventuel" % nom_image)
    pagegen = pagegenerators.PagesFromTitlesGenerator(relatedPagesList)
    bot = replace.ReplaceRobot(pagegen, replaceList, [], acceptall=True)
    bot.run()