From 9f71de8c0e91c16b61d8a6bf1e1cf23056d6d04d Mon Sep 17 00:00:00 2001
From: gyc990326
Date: Wed, 5 Oct 2016 11:10:13 +0800
Subject: [PATCH 1/2] Add files via upload

Add support to backup using only API instead of Special:Export
---
 dumpgenerator.py | 250 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 243 insertions(+), 7 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a045ace5..3bada739 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
+# encoding=utf8
 
 # dumpgenerator.py A generator of dumps for wikis
 # Copyright (C) 2011-2016 WikiTeam developers
@@ -23,10 +24,20 @@
     from kitchen.text.converters import getwriter
 except ImportError:
     print "Please install the kitchen module."
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+import xml.dom.minidom as MD
+
 import cookielib
 import cPickle
 import datetime
 import sys
+import io
+import traceback
 try:
     import argparse
 except ImportError:
@@ -436,8 +447,12 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
[...]
     except PageMissingError as pme:
         # The <page> does not exist. Not a problem, if we get the <siteinfo>.
         xml = pme.xml
@@ -458,12 +473,12 @@ def getXMLHeader(config={}, session=None):
                 )
                 config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                     + ':Export'
-                xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+                xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
         except PageMissingError as pme:
             xml = pme.xml
         except ExportAbortedError:
             pass
-
+    config['curonly'] = curonly
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
[...]
+def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
+    xml = ''
+    c = 0
+    maxseconds = 100  # max seconds to wait in a single sleeping
+    maxretries = config['retries']  # x retries and skip
+    increment = 20  # increment every retry
+    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
+        if c > 0 and c < maxretries:
+            wait = increment * c < maxseconds and increment * \
+                c or maxseconds  # incremental until maxseconds
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
+            time.sleep(wait)
+            # reducing server load requesting smallest chunks (if curonly then
+            # rvlimit = 1 from mother function)
+            if params['rvlimit'] > 1:
+                params['rvlimit'] = params['rvlimit'] / 2  # half
+        if c >= maxretries:
+            print '    We have retried %d times' % (c)
+            print '    MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
+            # If it's not already what we tried: our last chance, preserve only the last revision...
+            # config['curonly'] means that the whole dump is configured to save only the last,
+            # params['curonly'] should mean that we've already tried this
+            # fallback, because it's set by the following if and passed to
+            # getXMLPageCore
+            # TODO: save only the last version when failed
+            print '    Saving in the errors log, and skipping...'
+            logerror(
+                config=config,
+                text=u'Error while retrieving the last revision of "%s". Skipping.' %
+                (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
+            #raise ExportAbortedError(config['index'])
+            return ''  # empty xml
+
+        # FIXME HANDLE HTTP Errors HERE
+        try:
+            r = session.get(url=config['api'], params=params, headers=headers)
+            handleStatusCode(r)
+            xml = fixBOM(r)
+            #print xml
+        except requests.exceptions.ConnectionError as e:
+            print '    Connection error: %s'%(str(e[0]))
+            xml = ''
+        c += 1
+    return xml
+
+
+def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
+    """ Get the full history (or current only) of a page using API:Query
+        if params['curonly'] is set, then using export&exportwrap to export
+    """
+
+    title_ = title
+    title_ = re.sub(' ', '_', title_)
+    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
+    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
+    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
+    #print 'current:%s' % (title_)
+    if not config['curonly']:
+        params = {'titles': title_, 'action': 'query','format':'xml',
+                  'prop':'revisions',
+                  'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
+                  'rvcontinue' : None,
+                  'rvlimit' : 10 # TODO: set this by commandline
+                  }
+    else:
+        params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
+    #print 'params:%s' % (params)
+    if not config['curonly']:
+        firstpartok = False
+        lastcontinue = None
+        numberofedits = 0
+        ret = ''
+        while True:
+            # in case the last request is not right, saving last time's progress
+            if not firstpartok:
+                try:
+                    lastcontinue = params['rvcontinue']
+                except:
+                    lastcontinue = None
+
+            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+            if xml == "":
+                #just return so that we can continue, and getXMLPageCoreWithApi will log the error
+                return
+            try:
+                root = ET.fromstring(xml.encode('utf-8'))
+            except:
+                continue
+            try:
+                retpage = root.find('query').find('pages').find('page')
+            except:
+                continue
+            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
+                print 'Page not found'
+                raise PageMissingError(params['titles'], xml)
+            if not firstpartok:
+                try:
+                    # build the firstpart by ourselves to improve the memory usage
+                    ret = '  <page>\n'
+                    ret += '    <title>%s</title>\n' %(retpage.attrib['title'])
+                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
+                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
+                except:
+                    firstpartok = False
+                    continue
+                else:
+                    firstpartok = True
+                    yield ret
+            try:
+                ret = ''
+                edits = 0
+                if config['curonly'] or root.find('continue') == None:
+                    # transform the revision
+                    rev_,edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>'+ET.tostring(rev_)+'</stub1>')
+                    # convert it into text in case it throws MemoryError
+                    # delete the first three line and last two line,which is for setting the indent
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    yield ret
+                    numberofedits += edits
+                    break
+                else:
+                    rev_,edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
+                    numberofedits += edits
+                    yield ret
+            except:
+                traceback.print_exc()
+                params['rvcontinue'] = lastcontinue
+                ret = ''
+        yield '  </page>\n'
+    else:
+        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+        if xml == "":
+            raise ExportAbortedError(config['index'])
+        if not "</page>" in xml:
+            raise PageMissingError(params['titles'], xml)
+        else:
+            # strip these sha1s sums which keep showing up in the export and
+            # which are invalid for the XML schema (they only apply to
+            # revisions)
+            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
+            yield xml.split("</page>")[0]
+
+            # just for looking good :)
+            r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+            numberofedits = 0
+            numberofedits += len(re.findall(r_timestamp, xml))
+
+            yield "</page>\n"
+
+    if verbose:
+        if (numberofedits == 1):
+            print '    %s, 1 edit' % (title.strip())
+        else:
+            print '    %s, %d edits' % (title.strip(), numberofedits)
 
 def getXMLPageCore(headers={}, params={}, config={}, session=None):
     """  """
@@ -663,6 +890,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         else:
             print '    %s, %d edits' % (title.strip(), numberofedits)
 
+def getXMLPage_(config={}, title='', verbose=True, session=None):
+    #print config
+    if config['apiexport']:
+        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
+    else:
+        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
+    return ''
 
 def cleanXML(xml=''):
     """ Trim redundant info """
@@ -710,7 +944,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
             if c % 10 == 0:
                 print 'Downloaded %d pages' % (c)
             try:
-                for xml in getXMLPage(config=config, title=title, session=session):
+                for xml in getXMLPage_(config=config, title=title, session=session):
                     xml = cleanXML(xml=xml)
                     xmlfile.write(xml.encode('utf-8'))
             except PageMissingError:
@@ -1211,9 +1445,8 @@ def welcome():
     message += ''
     message += "\n"
     message += "#" * 73
-    message += "\n"
-    message += "# Copyright (C) 2011-%d WikiTeam developers                              #\n" % (datetime.datetime.now().year)
     message += """
+# Copyright (C) 2011-2014 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
@@ -1311,6 +1544,8 @@ def getParameters(params=[]):
         '--exnamespaces',
         metavar="1,2,3",
         help='comma-separated value of namespaces to exclude')
+    groupDownload.add_argument(
+        '--apiexport', action='store_true', help="Using API instead of Special:Export to export pages")
 
     # Meta info params
     groupMeta = parser.add_argument_group(
@@ -1494,6 +1729,7 @@ def getParameters(params=[]):
         'cookies': args.cookies or '',
         'delay': args.delay,
         'retries': int(args.retries),
+        'apiexport' : args.apiexport
     }
 
     other = {

From 8bc01207a810104d9271597f252d1989408311d5 Mon Sep 17 00:00:00 2001
From: gyc990326
Date: Wed, 5 Oct 2016 11:17:35 +0800
Subject: [PATCH 2/2] Add files via upload

Merge Changes
---
 dumpgenerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3bada739..851b7bc3 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-# encoding=utf8
 
 # dumpgenerator.py A generator of dumps for wikis
 # Copyright (C) 2011-2016 WikiTeam developers
@@ -1445,8 +1444,9 @@ def welcome():
     message += ''
     message += "\n"
     message += "#" * 73
+    message += "\n"
+    message += "# Copyright (C) 2011-%d WikiTeam developers                              #\n" % (datetime.datetime.now().year)
     message += """
-# Copyright (C) 2011-2014 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
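
With both patches applied, the new --apiexport switch is meant to be combined with the usual dump options. A hypothetical invocation might look like the following sketch; the wiki URL is only a placeholder and is not taken from the patch:

    python2 dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport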