notes-20160811-code
p { color: red }
#!/usr/bin/python
# from http://www.reades.com/2012/11/29/mb-archives/
# From Jon Reades, blog entry 29 Nov. 2012 "Extracting files from Moodle MBZ Archives with Python"
#
# rename the .mbz file to end in .zip, then unzip
#
# Configure the Python script below with the appropriate parameters:
# destination: path to where you want the Moodle files saved
# source: path to where the unpacked MBZ file is stored
# pattern: types of files you want to extract (the ones listed here are fairly comprehensive,
# and if you need more then you probably know enough to adjust the regex)
#
# Run the script and check that you've picked up all the content you needed.
#
# Note that there are a few limitations to this script:
#
# It doesn't preserve any hierarchy from the Moodle archive (so if there are folders and subfolders in the backup you will lose this)
# It doesn't deal with files that have the same name - in this case it will blindly overwrite the first occurrence of a file with the same name using the second file of the same name
#
# 2016.08.11 Note from GQMJr:
# One way to quickly correct the problem of non-unique filenames is to include the file's "itemid" field in the final filename
#
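# The entries in files.xml have the form:
#
# <file id="...">
#   <contenthash>d332f83ec4eabce53214fe036329b4f9c91b0b9a</contenthash>
#   <contextid>6652</contextid>
#   <component>mod_resource</component>
#   <filearea>content</filearea>
#   <itemid>0</itemid>
#   <filepath>/</filepath>
#   <filename>foo-filename.pdf</filename>
#   <userid>3</userid>
#   <filesize>111973</filesize>
#   <mimetype>application/pdf</mimetype>
#   <status>0</status>
#   <timecreated>1426863384</timecreated>
#   <timemodified>1426863389</timemodified>
#   <source>talk-guidelines.pdf</source>
#   <author>Firstname Lastname</author>
#   <license>allrightsreserved</license>
#   <sortorder>1</sortorder>
#   <repositorytype>$@NULL@$</repositorytype>
#   <repositoryid>$@NULL@$</repositoryid>
#   <reference>$@NULL@$</reference>
# </file>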
import xml.etree.ElementTree as etree
import fnmatch
import shutil
import os
import re
def locate(pattern, root=os.curdir):
    """Yield the path of every file in or below *root* whose name matches *pattern*.

    pattern -- an fnmatch-style (shell glob) pattern, matched against the
               bare file name only, never against directory components.
    root    -- directory at which the recursive search starts; defaults to
               the current directory.

    This is a generator: paths are produced lazily, in os.walk() order, and
    are absolute because *root* is made absolute before walking.
    """
    for path, _dirs, files in os.walk(os.path.abspath(root)):
        for filename in fnmatch.filter(files, pattern):
            yield os.path.join(path, filename)
# ---- Configuration ---------------------------------------------------------
# destination: directory that receives the extracted files. One
# sub-directory per contextid is created underneath it.
#destination = '/Users/foobar/Desktop/Moodle Copy/'
destination = './Moodle_copy/'
# source: directory holding the unpacked .mbz archive; files.xml is read
# from here and the content-hash-named files are searched for below it.
#source = '/Users/foobar/Desktop/Moodle Backup/'
source = './Moodle_backup/'
# pattern: file names (by extension, case-insensitive) that should be
# extracted. Group 1 is the name with surrounding whitespace stripped.
# GQMjr added gz|tar|jpg|log|tgz
# Raw string: \s and \. must reach the regex engine unchanged; in a plain
# string literal they are invalid escape sequences.
pattern = re.compile(r'^\s*(.+\.(?:pdf|png|zip|gz|tar|jpg|log|tgz|rtf|sav|mp3|mht|por|xlsx?|docx?|pptx?))\s*$', flags=re.IGNORECASE)

# ---- Extraction ------------------------------------------------------------
# os.path.join keeps the paths correct whether or not source/destination
# were configured with a trailing slash.
tree = etree.parse(os.path.join(source, 'files.xml'))
root = tree.getroot()
# Suffix appended to a file name when a file of that name already exists in
# the target directory; shared across all files so suffixes never repeat
# within one run.
counter = 1
# Single-argument print(...) runs on both Python 2 and Python 3 and produces
# the same text the original print statements did.
print("Root:  %s" % (root,))
for rsrc in root:
    contextid = rsrc.find('contextid').text #GQMjr
    fhash = rsrc.find('contenthash').text
    # findtext() yields '' for a missing or empty <filename>, where
    # find(...).text would give None and make pattern.search() raise.
    fname = rsrc.findtext('filename', '')

    if not pattern.search(fname):
        # A '.' entry is a directory record - nothing to copy, nothing to report. #GQMjr
        if fname != '.':
            print("No Match: ' %s '" % (fname,))
        continue

    target_dir = os.path.join(destination, contextid) #GQMjr
    # basename() keeps the copy inside target_dir even if the archive's
    # filename field contains a path separator.
    target = os.path.join(target_dir, os.path.basename(fname))
    # The stored file is named after its content hash; copy every file found
    # under that name back to its original file name.
    for x in locate(fhash, source):
        if not os.path.exists(target_dir): #GQMjr
            os.makedirs(target_dir) #GQMjr
        if os.path.isfile(target): #GQMjr
            # Name collision inside this contextid: keep both copies by
            # appending '-<counter>' to the later one.
            shutil.copyfile(x, target + '-' + str(counter)) #GQMjr
            counter = counter + 1 #GQMjr
        else:
            shutil.copyfile(x, target) #GQMjr