Utilisateur:PimpBot/source

# -*- coding: utf-8  -*-
"""
This module is based on cosmetic_changes.py.
It makes slight modifications to a wiki page's source code so that the code
looks cleaner. The changes are not supposed to change the look of the
rendered wiki page.
The changes it makes are mostly targeted at the French language.
Feel free to use.

"""
__version__ = '2009-06-05 Neuceu'
import wikipedia, pagegenerators, isbn
import re, codecs


warning = """ATTENTION: You can run this script as a stand-alone for testing purposes.
However, the changes that are made are only minor, and other users
might get angry if you fill the version histories and watchlists with such
irrelevant changes."""

docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
    '&warning;': warning,
}
# Summary message when using this module as a stand-alone script
msg_standalone = {
    'de': u'Bot: Kosmetische Änderungen',
    'en': u'Robot: Cosmetic changes',
    'fr': u'[[User:PimpBot|PimpBot]] : changements cosmétiques et orthographiques',
    }

# Summary message that will be appended to the normal message when
# cosmetic changes are made on the fly
msg_append = {
    'de': u'; kosmetische Änderungen',
    'en': u'; cosmetic changes',
    'fr': u'; changements cosmétiques',
    }

deprecatedTemplates = {
    'wikipedia': {
        'de': [
            u'Stub',
        ]
    }
}



# functions to manipulate wikitext strings (by default, all text arguments
# should be Unicode)
# All return the modified text as a unicode object

def replaceExcept2(text, old, new, exceptions, caseInsensitive=False,
                  allowoverlap=False, marker = '', site = None):
    """
    Return text with 'old' replaced by 'new', ignoring specified types of text.

    Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
    HTML comments. If caseInsensitive is true, then use case insensitive
    regex matching. If allowoverlap is true, overlapping occurrences are all
    replaced (watch out when using this, it might lead to infinite loops!).

    Parameters:
        text            - a unicode string
        old             - a compiled regular expression, or a string that
                          will be compiled into one
        new             - a unicode string (which can contain regular
                          expression references), or a function which takes
                          a match object as parameter. See parameter repl of
                          re.sub().
        exceptions      - a list of strings which signal what to leave out,
                          e.g. ['math', 'table', 'template']
        caseInsensitive - a boolean
        marker          - a string that will be added to the last replacement;
                          if nothing is changed, it is added at the end

    """
    # Hyperlink regex is defined in weblinkchecker.py
    import weblinkchecker

    if site is None:
        site = wikipedia.getSite()

    exceptionRegexes = {
        'comment':     re.compile(r'(?s)<!--.*?-->'),
        # section headers
        'header':      re.compile(r'\r\n=+.+=+ *\r\n'),
        'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
        'math':        re.compile(r'(?is)<math>.*?</math>'),
        'noinclude':   re.compile(r'(?is)<noinclude>.*?</noinclude>'),
        # wiki tags are ignored inside nowiki tags.
        'nowiki':      re.compile(r'(?is)<nowiki>.*?</nowiki>'),
        # preformatted text
        'pre':         re.compile(r'(?ism)<pre>.*?</pre>'),
        # matches both <source> and <syntaxhighlight> code blocks
        'source':      re.compile(r'(?is)<source.*?</source>|<syntaxhighlight.*?</syntaxhighlight>'),
        # inline references
        'ref':         re.compile(r'(?ism)<ref[ >].*?</ref>'),
        'timeline':    re.compile(r'(?is)<timeline>.*?</timeline>'),
        # lines that start with a space are shown in a monospace font and
        # have whitespace preserved.
        'startspace':  re.compile(r'(?m)^ (.*?)$'),
        # tables often have whitespace that is used to improve wiki
        # source code readability.
        # TODO: handle nested tables.
        'table':       re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
        # templates with parameters often have whitespace that is used to
        # improve wiki source code readability.
        # 'template':    re.compile(r'(?s){{.*?}}'),
        # The regex above fails on nested templates. This regex can handle
        # templates cascaded up to level 3, but no deeper. For arbitrary
        # depth, we'd need recursion which can't be done in Python's re.
        # After all, the language of correct parenthesis words is not regular.
        'template':    re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
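        # Illustrative example: the regex above matches the nested call
        # u'{{outer|{{inner|{{deep}}}}}}' in full, but, as explained above,
        # arbitrarily deep nesting is not handled.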
        'hyperlink':   weblinkchecker.compileLinkR(),
        'gallery':     re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
        # this matches internal wikilinks, but also interwiki, categories, and
        # images.
        'link':        re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
        'interwiki':   re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
                               % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
        'isbn':        re.compile(r'ISBN-13[:]?[-\d\s]*|ISBN[-\d\s]*|ASIN[-\d\s:]+'),
        'exotic':      re.compile(r'Tel\s?[:]\s?[\d-]+|#([AaBbCcEeFf]|\d){6}|style=\"[^\"]+\"'),
    }

    # if we got a string, compile it as a regular expression
    if isinstance(old, basestring):
        if caseInsensitive:
            old = re.compile(old, re.IGNORECASE | re.UNICODE)
        else:
            old = re.compile(old)

    dontTouchRegexes = []
    for exc in exceptions:
        if isinstance(exc, basestring):
            # assume it's a reference to the exceptionRegexes dictionary
            # defined above.
            if exc not in exceptionRegexes:
                raise ValueError("Unknown tag type: " + exc)
            dontTouchRegexes.append(exceptionRegexes[exc])
        else:
            # assume it's a regular expression
            dontTouchRegexes.append(exc)
    index = 0
    markerpos = len(text)
    while True:
        match = old.search(text, index)
        if not match:
            # nothing left to replace
            break

        # check which exception will occur next.
        nextExceptionMatch = None
        for dontTouchR in dontTouchRegexes:
            excMatch = dontTouchR.search(text, index)
            if excMatch and (
                    nextExceptionMatch is None or
                    excMatch.start() < nextExceptionMatch.start()):
                nextExceptionMatch = excMatch

        if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
            # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
            index = nextExceptionMatch.end()
        else:
            # We found a valid match. Replace it.
            if callable(new):
                # the parameter new can be a function which takes the match as a parameter.
                replacement = new(match)
            else:
                # it is not a function, but a string.

                # This is a small hack to make \n work. It would be better to
                # fix it earlier, but this is better than nothing.
                new = new.replace('\\n', '\n')

                # We cannot just insert the new string, as it may contain regex
                # group references such as \2 or \g<name>.
                # On the other hand, this approach does not work because it can't
                # handle lookahead or lookbehind (see bug #1731008):
                #replacement = old.sub(new, text[match.start():match.end()])
                #text = text[:match.start()] + replacement + text[match.end():]

                # So we have to process the group references manually.
                replacement = new

                groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
                while True:
                    groupMatch = groupR.search(replacement)
                    if not groupMatch:
                        break
                    groupID = groupMatch.group('name') or int(groupMatch.group('number'))
                    replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():]
            text = text[:match.start()] + replacement + text[match.end():]

            # continue the search on the remaining text
            if allowoverlap:
                index = match.start() + 1
            else:
                index = match.start() + len(replacement)
            markerpos = match.start() + len(replacement)
    text = text[:markerpos] + marker + text[markerpos:]
    return text
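
# Illustrative example (requires the pywikipedia framework to be set up,
# since replaceExcept2() falls back to wikipedia.getSite()):
#
#   replaceExcept2(u'a <nowiki>a</nowiki>', u'a', u'b', ['nowiki'])
#
# returns u'b <nowiki>a</nowiki>': the first occurrence of 'a' is replaced,
# while the one inside the <nowiki> tags is left untouched.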

    
def replaceExceptMathNowikiLinksGalleryAndComments(text, old, new):
    """
    Replaces old by new in text, skipping occurrences of old within comments,
    templates, tables, math/nowiki/gallery/timeline tags, references, wiki
    links, interwiki links, ISBN numbers and external links.
    
    Parameters:
        text - a string
        old  - a compiled regular expression
        new  - a string
    """
    return replaceExcept2( text, old, new, [ 'comment', 'includeonly', 'math', 'table', 'template', 'hyperlink', 'nowiki', 'gallery', 'timeline', 'ref', 'interwiki', 'link', 'isbn', 'exotic'] )


# Currently identical to the function above; kept under a separate name so the
# two exception lists can diverge later (e.g., to also leave external links
# untouched, as the name of this variant suggests).
def replaceExceptMathNowikiLinksGalleryAndComments2(text, old, new):
    """
    Replaces old by new in text, skipping the same exceptions as
    replaceExceptMathNowikiLinksGalleryAndComments above.
    
    Parameters:
        text - a string
        old  - a compiled regular expression
        new  - a string
    """
    return replaceExcept2( text, old, new, [ 'comment', 'includeonly', 'math', 'table', 'template', 'hyperlink', 'nowiki', 'gallery', 'timeline', 'ref', 'interwiki', 'link', 'isbn', 'exotic'] )




class CosmeticChangesToolkit:
    def __init__(self, site, debug = False):
        self.site = site
        self.debug = debug

    def change(self, text):
        """
        Given a wiki source code text, returns the cleaned up version.
        """
        oldText = text
        text = self.fixSelfInterwiki(text)
        text = self.standardizeInterwiki(text)
        text = self.standardizeCategories(text)
        text = self.cleanUpLinks(text)
        text = self.cleanUpSectionHeaders(text)
        # Disabled because of a bug, and because its usefulness is disputed
        # text = self.putSpacesInLists(text)
        # text = self.translateAndCapitalizeNamespaces(text)
        text = self.removeDeprecatedTemplates(text)
        text = self.resolveHtmlEntities(text)
        text = self.validXhtml(text)
        text = self.removeUselessSpaces(text)
        text = self.removeNonBreakingSpaceBeforePercent(text)
        try:
            text = isbn.hyphenateIsbnNumbers(text)
        except isbn.InvalidIsbnException:
            pass
        #text = self.replaceWithNiceQuotes(text)
        text = self.replaceIer(text)
        text = self.replaceNumbers(text)
        text = self.replaceParenthesis(text)
        text = self.replaceCenturies(text)
        text = self.cleanupPonctuation(text)
        
        return text

    def fixSelfInterwiki(self, text):
        """
        Interwiki links to the site itself are displayed like local links.
        Remove their language code prefix.
        """
        interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' % self.site.lang)
        text = interwikiR.sub(r'[[\1]]', text)
        return text
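
    # Illustrative example: on the French Wikipedia (self.site.lang == 'fr'),
    # u'Voir [[fr:Paris]].' becomes u'Voir [[Paris]].'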

    def standardizeInterwiki(self, text):
        """
        Makes sure that interwiki links are placed at the correct position and
        in the right order.
        """
        interwikiLinks = wikipedia.getLanguageLinks(text, insite = self.site)
        text = wikipedia.replaceLanguageLinks(text, interwikiLinks, site = self.site)
        return text

    def standardizeCategories(self, text):
        """
        Makes sure that categories are placed at the correct position, but
        does not sort them.
        """
        # The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#Position_der_Personendaten_am_.22Artikelende.22
        if self.site != wikipedia.getSite('de', 'wikipedia'):
            categories = wikipedia.getCategoryLinks(text, site = self.site)
            text = wikipedia.replaceCategoryLinks(text, categories, site = self.site)
        return text

    def translateAndCapitalizeNamespaces(self, text):
        """
        Makes sure that localized namespace names are used.
        """
        family = self.site.family
        # wiki links aren't parsed here.
        exceptions = ['nowiki', 'comment', 'math', 'pre']

        for nsNumber in family.namespaces:
            if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
                # Skip undefined namespaces
                continue
            namespaces = list(family.namespace(self.site.lang, nsNumber, all = True))
            thisNs = namespaces.pop(0)

            # skip main (article) namespace
            if thisNs and namespaces:
                text = wikipedia.replaceExcept(text, r'\[\[\s*(' + '|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions)
        return text

    def cleanUpLinks(self, text):
        # helper function which works on one link and either returns it
        # unmodified, or returns a replacement.
        def handleOneLink(match):
            titleWithSection = match.group('titleWithSection')
            label = match.group('label')
            trailingChars = match.group('linktrail')

            if not self.site.isInterwikiLink(titleWithSection):
                # The link looks like this:
                # [[page_title|link_text]]trailing_chars
                # We only work on namespace 0 because pipes and linktrails work
                # differently for images and categories.
                try:
                    page = wikipedia.Page(self.site, titleWithSection)
                except wikipedia.InvalidTitle:
                    return match.group()
                if page.namespace() == 0:
                    # Replace underlines by spaces, also multiple underlines
                    titleWithSection = re.sub('_+', ' ', titleWithSection)
                    # Remove double spaces
                    titleWithSection = re.sub('  +', ' ', titleWithSection)
                    # Remove unnecessary leading spaces from title,
                    # but remember if we did this because we eventually want
                    # to re-add it outside of the link later.
                    titleLength = len(titleWithSection)
                    titleWithSection = titleWithSection.lstrip()
                    hadLeadingSpaces = (len(titleWithSection) != titleLength)
                    hadTrailingSpaces = False
                    # Remove unnecessary trailing spaces from title,
                    # but remember if we did this because it may affect
                    # the linktrail and because we eventually want to
                    # re-add it outside of the link later.
                    if not trailingChars:
                        titleLength = len(titleWithSection)
                        titleWithSection = titleWithSection.rstrip()
                        hadTrailingSpaces = (len(titleWithSection) != titleLength)

                    # Convert URL-encoded characters to unicode
                    titleWithSection = wikipedia.url2unicode(titleWithSection, site = self.site)

                    if titleWithSection == '':
                        # just skip empty links.
                        return match.group()

                    # Remove unnecessary initial and final spaces from label.
                    # Please note that some editors prefer spaces around pipes. (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
                    if label is not None:
                        # Remove unnecessary leading spaces from label,
                        # but remember if we did this because we want
                        # to re-add it outside of the link later.
                        labelLength = len(label)
                        label = label.lstrip()
                        hadLeadingSpaces = (len(label) != labelLength)
                        # Remove unnecessary trailing spaces from label,
                        # but remember if we did this because it affects
                        # the linktrail.
                        if not trailingChars:
                            labelLength = len(label)
                            label = label.rstrip()
                            hadTrailingSpaces = (len(label) != labelLength)
                    else:
                        label = titleWithSection
                    if trailingChars:
                        label += trailingChars

                    if titleWithSection == label or titleWithSection[0].lower() + titleWithSection[1:] == label:
                        newLink = "[[%s]]" % label
                    # Check if we can create a link with trailing characters instead of a pipelink
                    elif label.startswith(titleWithSection) and re.sub(trailR, '', label[len(titleWithSection):]) == '':
                        newLink = "[[%s]]%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
                    else:
                        # Try to capitalize the first letter of the title.
                        # Maybe this feature is not useful for languages that
                        # don't capitalize nouns...
                        #if not self.site.nocapitalize:
                        if self.site.sitename() == 'wikipedia:de':
                            titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
                        newLink = "[[%s|%s]]" % (titleWithSection, label)
                    # re-add spaces that were pulled out of the link.
                    # Examples:
                    #   text[[ title ]]text        -> text [[title]] text
                    #   text[[ title | name ]]text -> text [[title|name]] text
                    #   text[[ title |name]]text   -> text[[title|name]]text
                    #   text[[title| name]]text    -> text [[title|name]]text
                    if hadLeadingSpaces:
                        newLink = ' ' + newLink
                    if hadTrailingSpaces:
                        newLink = newLink + ' '
                    return newLink
            # don't change anything
            return match.group()

        trailR = re.compile(self.site.linktrail())
        # The regular expression which finds links. Results consist of three groups:
        # group titleWithSection is the target page title including any #section part,
        # that is, everything before | or ].
        # group label is the alternative link title, that's everything between | and ].
        # group linktrail is the link trail, that's letters after ]] which are part of the word.
        # note that the definition of 'letter' varies from language to language.
        linkR = re.compile(r'\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')

        text = wikipedia.replaceExcept(text, linkR, handleOneLink, ['comment', 'math', 'nowiki', 'pre', 'startspace'])
        return text

    def resolveHtmlEntities(self, text):
        ignore = [
             38,     # Ampersand (&amp;)
             60,     # Less than (&lt;)
             62,     # Greater than (&gt;)
             91,     # Opening bracket - sometimes used intentionally inside links
             93,     # Closing bracket - sometimes used intentionally inside links
            124,     # Vertical bar (|) - used intentionally in navigation bar templates on de:
            160,     # Non-breaking space (&nbsp;) - not supported by Firefox textareas
        ]
        text = wikipedia.html2unicode(text, ignore = ignore)
        return text
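
    # Illustrative example (assuming html2unicode's usual behaviour): entities
    # not in the ignore list are resolved, the others are kept, so
    #   u'&eacute;tude &amp; &nbsp;'  becomes  u'étude &amp; &nbsp;'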

    def validXhtml(self, text):
        text = wikipedia.replaceExcept(text, r'<br>', r'<br />', ['comment', 'math', 'nowiki', 'pre'])
        return text

    def removeUselessSpaces(self, text):
        multipleSpacesR = re.compile('  +')
        spaceAtLineEndR = re.compile(' $')

        exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 'table', 'template']
        text = wikipedia.replaceExcept(text, multipleSpacesR, ' ', exceptions)
        text = wikipedia.replaceExcept(text, spaceAtLineEndR, '', exceptions)

        return text

    def removeNonBreakingSpaceBeforePercent(self, text):
        '''
        Newer MediaWiki versions automatically place a non-breaking space in
        front of a percent sign, so it is no longer required to place it
        manually.
        '''
        percentR = re.compile(r'(\d)&nbsp;%')
        text = percentR.sub(r'\1 %', text)
        return text
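
    # Illustrative example: u'50&nbsp;%' becomes u'50 %'.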

    def cleanUpSectionHeaders(self, text):
        """
        For better readability of section header source code, puts a space
        between the equal signs and the title.
        Example: ==Section title== becomes == Section title ==

        NOTE: This space is recommended in the syntax help on the English and
        German Wikipedia. It might be that it is not wanted on other wikis.
        If there are any complaints, please file a bug report.
        """
        for level in range(1, 7):
            equals = '=' * level
            text = wikipedia.replaceExcept(text, r'\n' + equals + ' *(?P<title>[^=]+?) *' + equals + ' *\r\n', '\n' + equals + ' \g<title> ' + equals + '\r\n', ['comment', 'math', 'nowiki', 'pre'])
        return text

    def putSpacesInLists(self, text):
        """
        For better readability of bullet list and enumeration wiki source code,
        puts a space between the * or # and the text.

        NOTE: This space is recommended in the syntax help on the English, German,
        and French Wikipedia. It might be that it is not wanted on other wikis.
        If there are any complaints, please file a bug report.
        """
        # FIXME: This breaks redirects.
        text = wikipedia.replaceExcept(text, r'(?m)^(?P<bullet>(\*+|#+):*)(?P<char>[^\s\*#:].+?)', '\g<bullet> \g<char>', ['comment', 'math', 'nowiki', 'pre'])
        return text

    def removeDeprecatedTemplates(self, text):
        if self.site.family.name in deprecatedTemplates and self.site.lang in deprecatedTemplates[self.site.family.name]:
            for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
                if not self.site.nocapitalize:
                    template = '[' + template[0].upper() + template[0].lower() + ']' + template[1:]
                text = wikipedia.replaceExcept(text, r'\{\{([mM][sS][gG]:)?' + template + '(?P<parameters>\|[^}]+|)}}', '', ['comment', 'math', 'nowiki', 'pre'])
        return text
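
    # Illustrative example: on de.wikipedia, where u'Stub' is listed in
    # deprecatedTemplates above, both u'{{Stub}}' and u'{{stub|reason}}'
    # are removed from the text.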
        
    
    # Change ' to ’
    def replaceWithNiceQuotes(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([^\'])\'([^\'\[])', r'\1' + u'’' +r'\2' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments( text2, r'([^\'])\'\[', r'\1' + u'’['  )
        text4 = wikipedia.replaceExcept( text3, r'\[\[([^:|\]\']*?)\'([^:|\]\']*?)\]\]', r'[[\1' + u'\'' + r'\2|\1' + u'’' + r'\2]]', [ 'math', 'comment', 'noinclude', 'nowiki' ] )
        return text4
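
    # Illustrative example (note that this method is currently disabled in
    # change() above): u"l'artiste" becomes u"l’artiste".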
        
    def replaceIer(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'(\s)Ier(\s)', r'\1I{{er}}\2' )
        text3 = wikipedia.replaceExcept( text2, r'\[\[([^:|\]]*?)Ier([^:|\]]*?)\]\]', r'[[\1' + u'Ier' + r'\2|\1' + u'I{{er}}' + r'\2]]', [ 'math', 'comment', 'noinclude', 'nowiki' ] )
        text4 = replaceExceptMathNowikiLinksGalleryAndComments( text3, r'(\s)(1)er(\s)', r'\1\2{{er}}\3' )
        # no leading space here: adding one would silently change the link target
        text5 = wikipedia.replaceExcept( text4, r'\[\[([^:|\]]*?)1er([^:|\]]*?)\]\]', r'[[\1' + u'1er' + r'\2|\1' + u'1{{er}}' + r'\2]]', [ 'math', 'comment', 'noinclude', 'nowiki' ] )
        text6 = wikipedia.replaceExcept( text5, r'(\s)(\d+)(eme|ème|e)(\s)', r'\1\2{{e}}\4', [ 'math', 'comment', 'noinclude', 'nowiki' ] )
        return text6
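
    # Illustrative examples:
    #   u'Napoléon Ier fonda' becomes u'Napoléon I{{er}} fonda'
    #   u'le 1er mai'         becomes u'le 1{{er}} mai'
    #   u'le 2ème tome'       becomes u'le 2{{e}} tome'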
    
    def replaceNumbers(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments2( text, r'(\d{5,10})', r'' + u'{{formatnum:' + r'\1' + u'}}' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments2( text2, r'([^\w\d])(\d)( |&nbsp[;])(\d{3})([^\w/])', r'\1\2\4\5' )
        text4 = replaceExceptMathNowikiLinksGalleryAndComments2( text3, r'([^\w])(\d+)( |&nbsp[;])(\d{3})( |&nbsp[;])(\d{3})([^\w/])', r'\1' + u'{{formatnum:' + r'\2\4\6' + u'}}' +r'\7' )
        text5 = replaceExceptMathNowikiLinksGalleryAndComments2( text4, r'([^\w])(\d+)( |&nbsp[;])(\d{3})([^\w/])', r'\1' + u'{{formatnum:' + r'\2\4' + u'}}' + r'\5' )
        return text5
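
    # Illustrative examples:
    #   u'ville de 12 345 habitants' becomes u'ville de {{formatnum:12345}} habitants'
    #   u'environ 123456 visiteurs'  becomes u'environ {{formatnum:123456}} visiteurs'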
    
    def replaceParenthesis(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([^\s])\(', r'\1 (' )
        return text2
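
    # Illustrative example: u'Paris(France)' becomes u'Paris (France)'.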
    
    def replaceCenturies(self, text):
        # longest alternatives first, so that e.g. u'J.-C.' is matched in full
        # rather than as u'J.-C' plus a stray trailing period
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([XVI]+)(e|' + u'è' + r'me|eme)' + u' siècle' + r'([^\w])(av\.|avant|av)\s(J\.-C\.|J\.C\.|J\.-C|J-C|JC)', u'{{' + r'\1' + u'e siècle av. J.-C.}}' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments( text2, r'([XVI]+)(e|' +u'è' + r'me|eme)' + u' siècle' + r'([^\w])', u'{{' + r'\1' + u'e siècle}}' + r'\3' )
        text4 = wikipedia.replaceExcept( text3, r'\[\[([XVI]+e' +u' siècle)\]\]', r'{{\1}}', [ 'math', 'comment', 'noinclude', 'nowiki' ] )
        return text4
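
    # Illustrative examples:
    #   u'au XIXe siècle, ' becomes u'au {{XIXe siècle}}, '
    #   u'[[XVIe siècle]]'  becomes u'{{XVIe siècle}}'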
        
    def cleanupPonctuation( self, text ):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, u'\.\.\.', u'…' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments( text2, r'([A-Za-z])\s?:(\s)([A-Za-z])', r'\1 :\2\3' )
        text4 = replaceExceptMathNowikiLinksGalleryAndComments( text3, r'([A-Za-z])\s?:([A-Za-z])', r'\1 : \2' )
        #text5 = replaceExceptMathNowikiLinksGalleryAndComments( text4, r'([A-Za-z])\s?;(\s)([A-Za-z])', r'\1 ;\2\3' )
        #text6 = replaceExceptMathNowikiLinksGalleryAndComments( text5, r'([A-Za-z])\s?;([A-Za-z])', r'\1 ; \2' )
        #text7 = replaceExceptMathNowikiLinksGalleryAndComments( text6, r'([A-Za-z]);', r'\1 ;' )
        text8 = replaceExceptMathNowikiLinksGalleryAndComments2( text4, r'([A-Za-z])\s?\.\s?([A-Za-z])', r'\1' + u'. ' + r'\2' )
        return text8
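
    # Illustrative examples:
    #   u'Note: voir la suite...' becomes u'Note : voir la suite…'
    #   u'fin.Début'              becomes u'fin. Début'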
        
    
class DictToolkit:
    def __init__(self, site, debug = False):
        self.site = site
        self.debug = debug
        self.words = {}
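        # Expected dict.txt format: one correction per line, the first token
        # being a regular expression to search for and the second its
        # replacement, e.g. (illustrative): "ocurrence occurrence"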
        try:
            f = codecs.open( "dict.txt", 'r', encoding = site.encoding())
            for line in f:
                # remove trailing newlines and carriage returns
                line = line.rstrip('\r\n')
                # skip empty or malformed lines
                if line:
                    w = line.split(' ')
                    if len(w) >= 2:
                        self.words[w[0]] = w[1]
            f.close()
        except IOError:
            print "Warning! There is no wordlist for your language!"
        else:
            print "Wordlist successfully loaded."
        
    def change(self, text):
        ct = text
        for (k, v) in self.words.iteritems():
            ct = replaceExceptMathNowikiLinksGalleryAndComments( ct, k, v )
        return ct
        
    def bad_text(self, text):
        return any(re.search(k, text) for k in self.words)
        
class PimpBot:
    def __init__(self, generator, acceptall = False):
        self.generator = generator
        self.acceptall = acceptall
        self.debug = not acceptall
        # Load default summary message.
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg_standalone))
        self.ccToolkit = CosmeticChangesToolkit(wikipedia.getSite(), debug = self.debug)
        self.dictToolkit = DictToolkit( wikipedia.getSite(), debug = self.debug)
        
    
    def run(self):
        for page in self.generator:
            try:
                changedText = page.get()
                changedText = self.ccToolkit.change(changedText)
                changedText = self.dictToolkit.change(changedText)
                if changedText != page.get():
                    if self.debug:
                        wikipedia.showDiff(page.get(), changedText)
                    
                    if not self.acceptall:
                        choice = wikipedia.inputChoice(u'Do you want to accept these changes?',  ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                        if choice in ['a', 'A']:
                            self.acceptall = True
                    if self.acceptall or choice in ['y', 'Y']:
                        page.put(changedText)
            except wikipedia.NoPage:
                print "Page %s does not exist?!" % page.aslink()
            except wikipedia.IsRedirectPage:
                print "Page %s is a redirect; skipping." % page.aslink()
            except wikipedia.LockedPage:
                print "Page %s is locked?!" % page.aslink()
            

def main():
    #page generator
    gen = None
    pageTitle = []
    
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    
    for arg in wikipedia.handleArgs():
        if not genFactory.handleArg(arg):
            pageTitle.append(arg)

    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp()
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = PimpBot(preloadingGen)
        print "Bot is running"
        bot.run()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()