From 9f71de8c0e91c16b61d8a6bf1e1cf23056d6d04d Mon Sep 17 00:00:00 2001
From: gyc990326
Date: Wed, 5 Oct 2016 11:10:13 +0800
Subject: [PATCH 1/2] Add files via upload

Add support to backup using only API instead of Special:Export
---
 dumpgenerator.py | 250 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 243 insertions(+), 7 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index a045ace5..3bada739 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
+# encoding=utf8
 
 # dumpgenerator.py A generator of dumps for wikis
 # Copyright (C) 2011-2016 WikiTeam developers
@@ -23,10 +24,20 @@
     from kitchen.text.converters import getwriter
 except ImportError:
     print "Please install the kitchen module."
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+import xml.dom.minidom as MD
+
 import cookielib
 import cPickle
 import datetime
 import sys
+import io
+import traceback
 try:
     import argparse
 except ImportError:
@@ -436,8 +447,12 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
[...]
     except PageMissingError as pme:
         # The <page> does not exist. Not a problem, if we get the <siteinfo>.
         xml = pme.xml
@@ -458,12 +473,12 @@ def getXMLHeader(config={}, session=None):
                 )
                 config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                     + ':Export'
-                xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+                xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
         except PageMissingError as pme:
             xml = pme.xml
         except ExportAbortedError:
             pass
-
+    config['curonly'] = curonly
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
[...]
+def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
+    xml = ''
+    c = 0
+    maxseconds = 100  # max seconds to wait in a single sleeping
+    maxretries = config['retries']  # x retries and skip
+    increment = 20  # increment every retry
+    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
+        if c > 0 and c < maxretries:
+            wait = increment * c < maxseconds and increment * \
+                c or maxseconds  # incremental until maxseconds
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
+            time.sleep(wait)
+            # reducing server load requesting smallest chunks (if curonly then
+            # rvlimit = 1 from mother function)
+            if params['rvlimit'] > 1:
+                params['rvlimit'] = params['rvlimit'] / 2  # half
+        if c >= maxretries:
+            print '    We have retried %d times' % (c)
+            print '    MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
+            # If it's not already what we tried: our last chance, preserve only the last revision...
+            # config['curonly'] means that the whole dump is configured to save only the last,
+            # params['curonly'] should mean that we've already tried this
+            # fallback, because it's set by the following if and passed to
+            # getXMLPageCore
+            # TODO: save only the last version when failed
+            print '    Saving in the errors log, and skipping...'
+            logerror(
+                config=config,
+                text=u'Error while retrieving the last revision of "%s". Skipping.' %
+                (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
+            #raise ExportAbortedError(config['index'])
+            return ''  # empty xml
+
+        # FIXME HANDLE HTTP Errors HERE
+        try:
+            r = session.get(url=config['api'], params=params, headers=headers)
+            handleStatusCode(r)
+            xml = fixBOM(r)
+            #print xml
+        except requests.exceptions.ConnectionError as e:
+            print '    Connection error: %s'%(str(e[0]))
+            xml = ''
+        c += 1
+    return xml
+
+
+def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
+    """ Get the full history (or current only) of a page using API:Query
+        if params['curonly'] is set, then using export&exportwrap to export
+    """
+
+    title_ = title
+    title_ = re.sub(' ', '_', title_)
+    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
+    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
+    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
+    #print 'current:%s' % (title_)
+    if not config['curonly']:
+        params = {'titles': title_, 'action': 'query','format':'xml',
+                  'prop':'revisions',
+                  'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
+                  'rvcontinue' : None,
+                  'rvlimit' : 10 # TODO: set this by commandline
+                  }
+    else:
+        params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
+    #print 'params:%s' % (params)
+    if not config['curonly']:
+        firstpartok = False
+        lastcontinue = None
+        numberofedits = 0
+        ret = ''
+        while True:
+            # in case the last request is not right, saving last time's progress
+            if not firstpartok:
+                try:
+                    lastcontinue = params['rvcontinue']
+                except:
+                    lastcontinue = None
+
+            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+            if xml == "":
+                #just return so that we can continue, and getXMLPageCoreWithApi will log the error
+                return
+            try:
+                root = ET.fromstring(xml.encode('utf-8'))
+            except:
+                continue
+            try:
+                retpage = root.find('query').find('pages').find('page')
+            except:
+                continue
+            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
+                print 'Page not found'
+                raise PageMissingError(params['titles'], xml)
+            if not firstpartok:
+                try:
+                    # build the firstpart by ourselves to improve the memory usage
+                    ret = '  <page>\n'
+                    ret += '    <title>%s</title>\n' %(retpage.attrib['title'])
+                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
+                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
+                except:
+                    firstpartok = False
+                    continue
+                else:
+                    firstpartok = True
+                    yield ret
+            try:
+                ret = ''
+                edits = 0
+                if config['curonly'] or root.find('continue') == None:
+                    # transform the revision
+                    rev_,edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>'+ET.tostring(rev_)+'</stub1>')
+                    # convert it into text in case it throws MemoryError
+                    # delete the first three line and last two line,which is for setting the indent
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    yield ret
+                    numberofedits += edits
+                    break
+                else:
+                    rev_,edits = reconstructRevisions(root=root)
+                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
+                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
+                    numberofedits += edits
+                    yield ret
+            except:
+                traceback.print_exc()
+                params['rvcontinue'] = lastcontinue
+                ret = ''
+        yield '  </page>\n'
+    else:
+        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+        if xml == "":
+            raise ExportAbortedError(config['index'])
+        if not "</page>" in xml:
+            raise PageMissingError(params['titles'], xml)
+        else:
+            # strip these sha1s sums which keep showing up in the export and
+            # which are invalid for the XML schema (they only apply to
+            # revisions)
+            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
+            yield xml.split("</page>")[0]
+
+            # just for looking good :)
+            r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+            numberofedits = 0
+            numberofedits += len(re.findall(r_timestamp, xml))
+
+            yield "</page>\n"
+
+    if verbose:
+        if (numberofedits == 1):
+            print '    %s, 1 edit' % (title.strip())
+        else:
+            print '    %s, %d edits' % (title.strip(), numberofedits)
 
 def getXMLPageCore(headers={}, params={}, config={}, session=None):
     """  """
@@ -663,6 +890,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         else:
             print '    %s, %d edits' % (title.strip(), numberofedits)
 
+def getXMLPage_(config={}, title='', verbose=True, session=None):
+    #print config
+    if config['apiexport']:
+        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
+    else:
+        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
+    return ''
 
 def cleanXML(xml=''):
     """ Trim redundant info """
@@ -710,7 +944,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
             if c % 10 == 0:
                 print 'Downloaded %d pages' % (c)
             try:
-                for xml in getXMLPage(config=config, title=title, session=session):
+                for xml in getXMLPage_(config=config, title=title, session=session):
                     xml = cleanXML(xml=xml)
                     xmlfile.write(xml.encode('utf-8'))
             except PageMissingError:
@@ -1211,9 +1445,8 @@ def welcome():
     message += ''
     message += "\n"
     message += "#" * 73
-    message += "\n"
-    message += "# Copyright (C) 2011-%d WikiTeam developers                              #\n" % (datetime.datetime.now().year)
     message += """
+# Copyright (C) 2011-2014 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
@@ -1311,6 +1544,8 @@ def getParameters(params=[]):
         '--exnamespaces',
         metavar="1,2,3",
         help='comma-separated value of namespaces to exclude')
+    groupDownload.add_argument(
+        '--apiexport', action='store_true', help="Using API instead of Special:Export to export pages")
 
     # Meta info params
     groupMeta = parser.add_argument_group(
@@ -1494,6 +1729,7 @@ def getParameters(params=[]):
         'cookies': args.cookies or '',
         'delay': args.delay,
         'retries': int(args.retries),
+        'apiexport' : args.apiexport
     }
 
     other = {

From 8bc01207a810104d9271597f252d1989408311d5 Mon Sep 17 00:00:00 2001
From: gyc990326
Date: Wed, 5 Oct 2016 11:17:35 +0800
Subject: [PATCH 2/2] Add files via upload

Merge Changes
---
 dumpgenerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3bada739..851b7bc3 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
-# encoding=utf8
 
 # dumpgenerator.py A generator of dumps for wikis
 # Copyright (C) 2011-2016 WikiTeam developers
@@ -1445,8 +1444,9 @@ def welcome():
     message += ''
     message += "\n"
     message += "#" * 73
+    message += "\n"
+    message += "# Copyright (C) 2011-%d WikiTeam developers                              #\n" % (datetime.datetime.now().year)
     message += """
-# Copyright (C) 2011-2014 WikiTeam                                      #
 # This program is free software: you can redistribute it and/or modify  #
 # it under the terms of the GNU General Public License as published by  #
 # the Free Software Foundation, either version 3 of the License, or     #
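
With both patches applied, the new --apiexport switch is meant to be combined with the usual dump options. A hypothetical invocation might look like the following sketch; the wiki URL is only a placeholder and is not taken from the patch:

    python2 dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport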