diff --git a/dumpgenerator.py b/dumpgenerator.py
index a045ace5..851b7bc3 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -23,10 +23,20 @@
from kitchen.text.converters import getwriter
except ImportError:
print "Please install the kitchen module."
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+import xml.dom.minidom as MD
+
import cookielib
import cPickle
import datetime
import sys
+import io
+import traceback
try:
import argparse
except ImportError:
@@ -436,8 +446,12 @@ def getXMLHeader(config={}, session=None):
         # The <page> does not exist. Not a problem, if we get the <mediawiki>.
xml = pme.xml
@@ -458,12 +472,12 @@ def getXMLHeader(config={}, session=None):
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
- xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
-
+ config['curonly'] = curonly
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
+
+def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
+    """  """
+    xml = ''
+    c = 0
+    maxseconds = 100  # max seconds to wait in a single sleeping
+    maxretries = config['retries']  # x retries and skip
+    increment = 20  # increment every retry
+    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
+ if c > 0 and c < maxretries:
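+            # incremental back-off: wait longer on every retry and halve rvlimit to lighten the request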
+ wait = increment * c < maxseconds and increment * \
+ c or maxseconds # incremental until maxseconds
+ print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait)
+ time.sleep(wait)
+ # reducing server load requesting smallest chunks (if curonly then
+ # rvlimit = 1 from mother function)
+ if params['rvlimit'] > 1:
+ params['rvlimit'] = params['rvlimit'] / 2 # half
+ if c >= maxretries:
+ print ' We have retried %d times' % (c)
+ print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
+ # If it's not already what we tried: our last chance, preserve only the last revision...
+ # config['curonly'] means that the whole dump is configured to save only the last,
+ # params['curonly'] should mean that we've already tried this
+ # fallback, because it's set by the following if and passed to
+ # getXMLPageCore
+ # TODO: save only the last version when failed
+ print ' Saving in the errors log, and skipping...'
+ logerror(
+ config=config,
+ text=u'Error while retrieving the last revision of "%s". Skipping.' %
+ (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
+ #raise ExportAbortedError(config['index'])
+ return '' # empty xml
+
+ # FIXME HANDLE HTTP Errors HERE
+ try:
+ r = session.get(url=config['api'], params=params, headers=headers)
+ handleStatusCode(r)
+ xml = fixBOM(r)
+ #print xml
+ except requests.exceptions.ConnectionError as e:
+ print ' Connection error: %s'%(str(e[0]))
+ xml = ''
+ c += 1
+ return xml
+
+
+def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
+ """ Get the full history (or current only) of a page using API:Query
+ if params['curonly'] is set, then using export&exportwrap to export
+ """
+
+ title_ = title
+ title_ = re.sub(' ', '_', title_)
+ # do not convert & into %26, title_ = re.sub('&', '%26', title_)
+ # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
+ # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
+ #print 'current:%s' % (title_)
+ if not config['curonly']:
+ params = {'titles': title_, 'action': 'query','format':'xml',
+ 'prop':'revisions',
+ 'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
+ 'rvcontinue' : None,
+ 'rvlimit' : 10 # TODO: set this by commandline
+ }
+ else:
+ params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
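+    # with curonly, the API's export mode returns Special:Export-style XML directly;
+    # otherwise revisions are fetched in chunks of rvlimit and rebuilt into <page> XML below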
+ #print 'params:%s' % (params)
+ if not config['curonly']:
+ firstpartok = False
+ lastcontinue = None
+ numberofedits = 0
+ ret = ''
+ while True:
+ # in case the last request is not right, saving last time's progress
+ if not firstpartok:
+ try:
+ lastcontinue = params['rvcontinue']
+ except:
+ lastcontinue = None
+
+ xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+ if xml == "":
+ #just return so that we can continue, and getXMLPageCoreWithApi will log the error
+ return
+ try:
+ root = ET.fromstring(xml.encode('utf-8'))
+ except:
+ continue
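+            # locate the page node in the API response; on an unexpected structure, retry the request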
+ try:
+ retpage = root.find('query').find('pages').find('page')
+ except:
+ continue
+ if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
+ print 'Page not found'
+ raise PageMissingError(params['titles'], xml)
+ if not firstpartok:
+ try:
+ # build the firstpart by ourselves to improve the memory usage
+                    ret = '  <page>\n'
+                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
+                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
+                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
+ except:
+ firstpartok = False
+ continue
+ else:
+ firstpartok = True
+ yield ret
+ try:
+ ret = ''
+ edits = 0
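+                # no <continue> element in the response means this is the last chunk of revisions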
+ if config['curonly'] or root.find('continue') == None:
+ # transform the revision
+ rev_,edits = reconstructRevisions(root=root)
+                        xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+ # convert it into text in case it throws MemoryError
+ # delete the first three line and last two line,which is for setting the indent
+ ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
+ yield ret
+ numberofedits += edits
+ break
+ else:
+ rev_,edits = reconstructRevisions(root=root)
+                        xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
+ ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
+ params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
+ numberofedits += edits
+ yield ret
+ except:
+ traceback.print_exc()
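+                # restore the last good continuation token so the failed chunk is requested again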
+ params['rvcontinue'] = lastcontinue
+ ret = ''
+        yield '  </page>\n'
+ else:
+ xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
+ if xml == "":
+ raise ExportAbortedError(config['index'])
+        if not "</page>" in xml:
+ raise PageMissingError(params['titles'], xml)
+ else:
+ # strip these sha1s sums which keep showing up in the export and
+ # which are invalid for the XML schema (they only apply to
+ # revisions)
+            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
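+        # yield the page content up to the closing tag; </page> itself is yielded at the end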
+        yield xml.split("</page>")[0]
+
+ # just for looking good :)
+        r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+ numberofedits = 0
+ numberofedits += len(re.findall(r_timestamp, xml))
+
+        yield "</page>\n"
+
+ if verbose:
+ if (numberofedits == 1):
+ print ' %s, 1 edit' % (title.strip())
+ else:
+ print ' %s, %d edits' % (title.strip(), numberofedits)
def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -663,6 +889,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
else:
print ' %s, %d edits' % (title.strip(), numberofedits)
+def getXMLPage_(config={}, title='', verbose=True, session=None):
+ #print config
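+    # dispatcher: use the API-based exporter when --apiexport is given, otherwise the Special:Export path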
+ if config['apiexport']:
+ return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
+ else:
+ return getXMLPage(config=config, title=title, verbose=verbose, session=session)
+ return ''
def cleanXML(xml=''):
""" Trim redundant info """
@@ -710,7 +943,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
- for xml in getXMLPage(config=config, title=title, session=session):
+ for xml in getXMLPage_(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
@@ -1311,6 +1544,8 @@ def getParameters(params=[]):
'--exnamespaces',
metavar="1,2,3",
help='comma-separated value of namespaces to exclude')
+ groupDownload.add_argument(
+ '--apiexport', action='store_true', help="Using API instead of Special:Export to export pages")
# Meta info params
groupMeta = parser.add_argument_group(
@@ -1494,6 +1729,7 @@ def getParameters(params=[]):
'cookies': args.cookies or '',
'delay': args.delay,
'retries': int(args.retries),
+ 'apiexport' : args.apiexport
}
other = {