#!/usr/bin/env python """ Requires Python 2.5 Converting between python bytecode and XML. These functions are probably most interesting for users: * Disassembler: - xml_code() code object --> XML - xml_pyc() pyc file --> XML * Assembler: - code_xml() XML --> code object - pyc_xml() XML --> pyc file The XML file is almost self documenting. The assembly language is of course somewhat redundant. Therefore, only the column containing the name of the opcode and the column containing the optional argument is by the assembler. So for example: and will be assembled to the same bytecode. Blank lines and lines starting with a '#' will be ignored. Only the assembler is implemented in the source code below, because the disassembler is already in the 'dis'-module. When assembling, all XML attributes are ignored, these attributes are only inserted by the disassembler to make the XML more readable. """ __author__ = 'Ilan Schnell ' __version__ = '0.2' import dis, marshal, new import struct, sys, time import random, string, cStringIO, re from types import CodeType from xml.etree import ElementTree as ET XMLindentation = 4 * " " def indent(elem, level=0): "Adds whitespace to the tree, so that it results in a prettyprinted tree." i = "\n" + level * XMLindentation if len(elem): if not elem.text or not elem.text.strip(): elem.text = i + XMLindentation for e in elem: indent(e, level+1) if not e.tail or not e.tail.strip(): e.tail = i + XMLindentation if not e.tail or not e.tail.strip(): e.tail = i else: if level and (not elem.tail or not elem.tail.strip()): elem.tail = i magic = "".join(random.choice(string.letters + string.digits) for i in xrange(40)) hex_pat = re.compile(r'0x[0-9a-fA-F]{2,}') def disassemble(code): """Given a code object, return output from dis.disassemble as a string. (dis.disassemble writes its output to stdout.)""" rem = sys.stdout sys.stdout = cStringIO.StringIO() dis.disassemble(code) ret = sys.stdout.getvalue() ret = ret.replace(']]>', ']]X>') sys.stdout = rem return hex_pat.sub('0x...', ret) def elem_code(obj, n=None): """Given a code object, return the ET.Element which represents the code in a more human readable form.""" attrib = {'n': str(n)} if n >= 0 else {} if type(obj) == CodeType: elem = ET.Element('code', attrib) elem.attrib.update(name=obj.co_name) tags = [n for n in dir(obj) if n.startswith('co_')] tags.sort() tags.remove('co_consts') for tag in tags + ['co_consts']: name = tag[3:] caelem = ET.Element(tag) ca = getattr(obj, tag) if name in 'filename name'.split(): caelem.text = ca elif name in 'argcount firstlineno nlocals stacksize'.split(): caelem.text = str(ca) elif name in 'cellvars freevars names varnames consts'.split(): for i, item in enumerate(ca): caelem.append(elem_code(item, i)) elif name == 'flags': caelem.text = '0x%04x' % ca elif name == 'lnotab': caelem.text = '0x' + ca.encode('hex') elif name == 'code': caelem.text = ('[%s[\n' % magic) + disassemble(obj) + \ (16*' '+']%s]' % magic) else: raise 'Attribute %r unknown for code object' % tag if caelem is not None: elem.append(caelem) else: elem = ET.Element('item', attrib) elem.text = repr(obj) return elem def xmlpostprocess(xml): pat = re.compile(r'\[%s\[.*?\]%s\]' % (magic, magic), re.DOTALL) def repl(match): s = match.group() s = s.replace('<', '<') s = s.replace('>', '>') s = s.replace('&', '&') return s ret = pat.sub(repl, xml) ret = ret.replace('[%s[' % magic, '') return '''\ ''' + ret + '\n' def xml_code(code): "Given a code object, return corresponding XML as single string" elem = ET.Element('bytecode') elem.append(elem_code(code)) indent(elem) return xmlpostprocess(ET.tostring(elem)) def headET(f): "Given file object, return an ET.Element representing the header" head = ET.Element('head') e = ET.Element('magic') e.text = '0x' + f.read(4).encode('hex') head.append(e) e = ET.Element('modtime') e.text = time.asctime(time.localtime(struct.unpack('L', f.read(4))[0])) head.append(e) return head def xml_pyc(filename): """Given a filename of a python bytecode file, return corresponding XML as single string""" f = open(filename, 'rb') elem = ET.Element('bytecode') elem.append(headET(f)) elem.append(elem_code(marshal.load(f))) f.close() indent(elem) return xmlpostprocess(ET.tostring(elem)) # ------------ def assemble(text): lst = [] # list of integers 0..255 representing the bytecode for inst in text.splitlines(): if inst.count('\t'): print 'Warning: Found tabs in assembly section.' if (not inst.strip()) or inst.startswith('#'): continue cmd = inst[16:42].split() if not cmd: continue byteName = cmd[0] byteCode = dis.opmap[byteName] lst.append(byteCode) if byteCode >= dis.HAVE_ARGUMENT: if len(cmd) != 2: exit('Error: Opcode %r takes an argument' % byteName) intArg = int(cmd[1]) lst.append(intArg % 256) lst.append(intArg / 256) elif len(cmd) != 1: exit('Error: Opcode %r takes no argument' % byteName) return ''.join(chr(n) for n in lst) def code_elem(elem): "Given an ET.Element, return the representing code object." if elem.tag == 'code': args = '''argcount nlocals stacksize flags code consts names varnames filename name firstlineno lnotab'''.split() return new.code(*[code_elem(elem.find('co_'+arg)) for arg in args]) elif elem.tag.startswith('co_'): text = elem.text name = elem.tag[3:] if name in 'filename name'.split(): return text elif name in 'argcount firstlineno nlocals stacksize'.split(): return int(text) elif name in 'cellvars freevars names varnames consts'.split(): return tuple(code_elem(e) for e in elem) elif name == 'flags': assert len(text) == 6 assert text[:2] == '0x' return int(text[2:], 16) elif name == 'lnotab': assert text[:2] == '0x' return text[2:].decode('hex') elif name == 'code': return assemble(text) else: return eval(elem.text) def code_xml(xml): "Convert a single string containing XML into code object" return code_elem(ET.XML(xml).find('code')) def pyc_xml(xml, filename): """Convert a single string containing XML into python bytecode and write this bytecode to file""" elem = ET.XML(xml) if not elem.find('head'): exit('Error: missing in XML file, cannot convert to pyc file.') f = open(filename, 'wb') magic = elem.find('head/magic').text assert magic[:2] == '0x' f.write(magic[2:].decode('hex')) timestr = elem.find('head/modtime').text f.write(struct.pack('L', time.mktime(time.strptime(timestr)))) marshal.dump(code_elem(elem.find('code')), f) f.close() def test(filename): import compiler co1 = compile(open(filename).read(), '', 'exec') xml1 = xml_code(co1) co2 = code_xml(xml1) xml2 = xml_code(co2) assert xml1 == xml2 def main(): from optparse import OptionParser parser = OptionParser(usage = "usage: %prog [options] FILE", description = """\ Disassembler and assembler - convert between python bytecode and XML. Files are handled by their extension: (i) .xml is assembled to .pyc (ii) .pyc, .pyo bytecode files disassembled to .xml (iii) otherwise python source is assumed and (internally) compiled to bytecode before being disassembled into XML, missing the header information. """) parser.add_option("-o", "--output", action = "store", help = "Specify output filename explicitly. " "By default, the output filename is obtained " "by substituting the extension.") parser.add_option("-t", "--test", action = "store_true", help = "self test") options, args = parser.parse_args() if options.test: test(__file__) exit('OK') if not len(args) == 1: parser.error("incorrect number of arguments") filein = args[0] if filein.endswith('.xml'): # .xml --> .pyc if options.output: fileout = options.output else: fileout = filein[:-4] + '.pyc' print 'Assembling:', filein, ' --> ', fileout pyc_xml(open(filein).read(), fileout) else: # --> .xml if filein.endswith(('.pyc', '.pyo')): xml = xml_pyc(filein) fileout = filein[:-4] + '.xml' else: xml = xml_code(compile(open(filein).read(), filein, 'exec')) if filein.endswith('.py'): fileout = filein[:-3] + '.xml' else: fileout = filein + '.xml' if options.output: fileout = options.output print 'Disassembling:', filein, ' --> ', fileout fo = open(fileout, 'w') fo.write(xml) fo.close() if __name__ == "__main__": main() # Local Variables: # mode: python # End: