437 lines
17 KiB
Python
437 lines
17 KiB
Python
"""Very simple and fast XML parser, used for intra-paragraph text.

Devised by Aaron Watters in the bad old days before Python had fast
parsers available.  Constructs the lightest possible in-memory
representation; parses most files we have seen in pure python very
quickly.

The output structure is the same as the one produced by pyRXP,
our validating C-based parser, which was written later.  It will
use pyRXP if available.

This is used to parse intra-paragraph markup.

Example parse::

    <this type="xml">text <b>in</b> xml</this>

    ( "this",
      {"type": "xml"},
      [ "text ",
        ("b", None, ["in"], None),
        " xml"
        ],
       None )

    { 0: "this"
      "type": "xml"
      1: ["text ",
          {0: "b", 1:["in"]},
          " xml"]
    }

Ie, xml tag translates to a tuple:
    (name, dictofattributes, contentlist, miscellaneousinfo)

where miscellaneousinfo can be anything, (but defaults to None)
(with the intention of adding, eg, line number information)

special cases: name of "" means "top level, no containing tag".
Top level parse always looks like this::

    ("", None, list, None)

contained text of None means <simple_tag/>

In order to support stuff like::

    <this></this><one></one>

AT THE MOMENT &amp; ETCETERA ARE IGNORED. THEY MUST BE PROCESSED
IN A POST-PROCESSING STEP.

PROLOGUES ARE NOT UNDERSTOOD.  OTHER STUFF IS PROBABLY MISSING.
"""

RequirePyRXP = 0        # set this to 1 to disable the nonvalidating fallback parser.

try:
    # Assume the pyRXPU extension is importable; the ImportError handler
    # below flips this flag when it is not.
    simpleparse = 0
    import pyRXPU

    def warnCB(s):
        """Warning callback handed to the pyRXPU parser: print the message."""
        print(s)

    # One shared parser instance, configured once at import time.
    pyRXP_parser = pyRXPU.Parser(
        ErrorOnValidityErrors=1,
        NoNoDTDWarning=1,
        ExpandCharacterEntities=1,
        ExpandGeneralEntities=1,
        warnCB=warnCB,
        srcName='string input',
        ReturnUTF8=0,
        )

    def parsexml(xmlText, oneOutermostTag=0, eoCB=None, entityReplacer=None, parseOpts=None):
        """Parse xmlText with the shared pyRXPU parser.

        Parameters:
            xmlText: the text to parse.
            oneOutermostTag: if true, return the parser's result directly;
                otherwise wrap it as the single child of an unnamed
                top-level node ``('', None, [result], None)``.
            eoCB: stored on the shared parser as its ``eoCB`` attribute
                before parsing.
            entityReplacer: accepted so the signature matches the fallback
                parser; not used by this implementation.
            parseOpts: optional mapping of extra keyword arguments passed
                through to ``pyRXP_parser.parse``.

        Returns:
            The parse result, wrapped or not as described above.
        """
        pyRXP_parser.eoCB = eoCB
        # ``parseOpts or {}`` avoids a shared mutable default argument and
        # lets callers pass None explicitly.
        p = pyRXP_parser.parse(xmlText, **(parseOpts or {}))
        if oneOutermostTag:
            return p
        return ('', None, [p], None)
except ImportError:
    simpleparse = 1

NONAME = ""
NAMEKEY = 0
CONTENTSKEY = 1
CDATAMARKER = "<![CDATA["
LENCDATAMARKER = len(CDATAMARKER)
CDATAENDMARKER = "]]>"
# (entity, replacement) pairs applied in order.  "&amp;" must be last so
# that text such as "&amp;lt;" becomes "&lt;" and is not expanded twice.
replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")] # amp must be last
#replacelist = []

def unEscapeContentList(contentList):
    """Return a new list with the entities in replacelist expanded.

    Only ``str`` items that contain an ampersand are rewritten; every
    other item (for example an already-parsed child tag tuple) is copied
    to the result unchanged.  The input list itself is not modified.
    """
    result = []
    for e in contentList:
        if isinstance(e, str) and "&" in e:
            for (old, new) in replacelist:
                e = e.replace(old, new)
        result.append(e)
    return result

def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList):
    """Official interface to the pure-python parser.

    Runs parsexml0 on xmltext and throws away the cursor it reports.
    With oneOutermostTag true, the first item of the top-level content
    list is returned instead of the whole unnamed top-level node.  eoCB
    is accepted but not used here.

    Raises ImportError when RequirePyRXP is set, i.e. when the fallback
    parser has been disabled.
    """
    if RequirePyRXP:
        raise ImportError("pyRXP not found, fallback parser disabled")
    tree, _end = parsexml0(xmltext, entityReplacer=entityReplacer)
    return tree[2][0] if oneOutermostTag else tree


# No pyRXP available: expose the fallback under the public name.
if simpleparse:
    parsexml = parsexmlSimple

def parseFile(filename):
    """Read the file at *filename* in text mode and return parsexml() of its contents."""
    # The context manager closes the handle even if reading fails.
    with open(filename, 'r') as f:
        raw = f.read()
    return parsexml(raw)

verbose = 0

def skip_prologue(text, cursor):
    """Skip any prologue found after cursor, return index of rest of text.

    Starting at *cursor*, repeatedly skips ``<!DOCTYPE ...>``,
    ``<?xml ...?>`` and ``<!-- ... -->`` constructs, together with any
    whitespace in front of them.  Scanning stops at the first ``<`` that
    does not open one of these, at the first non-whitespace text, or when
    no ``<`` is left.

    Returns the index just past the last construct skipped, or *cursor*
    unchanged if nothing was skipped.

    Raises ValueError if a recognised construct is never terminated.

    Still not complete: a DOCTYPE is assumed to end at the first ``>``,
    so one with an internal subset will confuse it.
    """
    # (text following "<", terminator that ends the construct)
    prologue_elements = (("!DOCTYPE", ">"), ("?xml", "?>"), ("!--", "-->"))
    while True:
        openbracket = text.find("<", cursor)
        if openbracket < 0:
            break
        if text[cursor:openbracket].strip():
            # Real content precedes the bracket; skipping further would
            # silently drop that content.
            break
        past = openbracket + 1
        for marker, terminator in prologue_elements:
            if text.startswith(marker, past):
                # Search after the marker so "<!-->" is not taken for a
                # complete comment.
                end = text.find(terminator, past + len(marker))
                if end < 0:
                    raise ValueError("can't close prologue %r" % marker)
                cursor = end + len(terminator)
                break
        else:
            # The bracket opens an ordinary tag: the prologue is over.
            break
    return cursor

def parsexml0(xmltext, startingat=0, toplevel=1,
        # snarf in some globals
        entityReplacer=unEscapeContentList,
        ):
    """Simple recursive descent xml parser.

    Parameters:
        xmltext: the complete text being parsed.
        startingat: index in xmltext from which to look for the next tag.
        toplevel: any value other than None marks the outermost call, which
            strips xmltext, skips a prologue and builds the unnamed
            top-level node.  Recursive calls pass None.
        entityReplacer: callable applied to a non-empty content list before
            it is returned, or a false value to leave content untouched.

    Returns:
        ``(node, cursor)`` where node is the tuple
        ``(name, attributes, contents, extra)`` and cursor is the index
        just past what was parsed.
        Special case: a comment returns ``(None, cursor)``.

    Raises:
        ValueError: for malformed input (unclosed or unnamed tags,
        mismatched close tags, bad attribute values, unterminated
        comments or CDATA sections).
    """
    # DEFAULTS
    NameString = NONAME
    ContentList = AttDict = ExtraStuff = None
    if toplevel is not None:
        xmltext = xmltext.strip()
    cursor = startingat
    # look for interesting starting points
    firstbracket = xmltext.find("<", cursor)
    afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
    docontents = 1
    if firstbracket<0:
        # no tags
        if toplevel is not None:
            ContentList = [xmltext[cursor:]]
            if entityReplacer: ContentList = entityReplacer(ContentList)
            return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
        else:
            raise ValueError("no tags at non-toplevel %s" % repr(xmltext[cursor:cursor+20]))
    L = []
    # look for start tag
    # NEED to force always outer level is unnamed!!!
    if toplevel is not None:
        NameString = name = NONAME
        cursor = skip_prologue(xmltext, cursor)
    # special case: CDATA
    elif afterbracket2char=="![" and xmltext.startswith(CDATAMARKER, firstbracket):
        # skip straight to the close marker
        startcdata = firstbracket+len(CDATAMARKER)
        endcdata = xmltext.find(CDATAENDMARKER, startcdata)
        if endcdata<0:
            raise ValueError("unclosed CDATA %s" % repr(xmltext[cursor:cursor+20]))
        NameString = CDATAMARKER
        ContentList = [xmltext[startcdata: endcdata]]
        cursor = endcdata+len(CDATAENDMARKER)
        docontents = None
    # special case COMMENT
    elif afterbracket2char=="!-" and xmltext[firstbracket:firstbracket+4]=="<!--":
        endcommentdashes = xmltext.find("--", firstbracket+4)
        if endcommentdashes<firstbracket:
            raise ValueError("unterminated comment %s" % repr(xmltext[cursor:cursor+20]))
        endcomment = endcommentdashes+2
        # Slice rather than index: the text may end right after the dashes.
        if xmltext[endcomment:endcomment+1]!=">":
            raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
        return (None, endcomment+1) # shortcut exit
    else:
        # get the rest of the tag
        closebracket = xmltext.find(">", firstbracket)
        noclose = closebracket<0
        if noclose:
            # Without this check a tag with no "=" would restart the
            # content scan at index 0 and recurse without end.
            raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
        startsearch = closebracket+1
        pastfirstbracket = firstbracket+1
        tagcontent = xmltext[pastfirstbracket:closebracket]
        # shortcut, no equal means nothing but name in the tag content
        if '=' not in tagcontent:
            if tagcontent.endswith("/"):
                # simple case
                tagcontent = tagcontent[:-1]
                docontents = None
            name = tagcontent.strip()
            if not name:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            NameString = name
            cursor = startsearch
        else:
            if '"' in tagcontent:
                # make sure the ">" found isn't inside doublequote pairs
                stop = None
                # not inside double quotes! (the split should have odd length)
                if noclose or len((tagcontent+".").split('"'))% 2:
                    stop=1
                while stop is None:
                    closebracket = xmltext.find(">", startsearch)
                    startsearch = closebracket+1
                    noclose = closebracket<0
                    tagcontent = xmltext[pastfirstbracket:closebracket]
                    # not inside double quotes! (the split should have odd length)
                    if noclose or len((tagcontent+".").split('"'))% 2:
                        stop=1
            if noclose:
                raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
            cursor = startsearch
            # handle simple tag /> syntax
            if xmltext[closebracket-1]=="/":
                closebracket = closebracket-1
                tagcontent = tagcontent[:-1]
                docontents = None
            tagcontent = tagcontent.strip()
            taglist = tagcontent.split("=")
            taglist0 = taglist[0]
            taglist0list = taglist0.split()
            name = taglist0list[0]
            NameString = name
            # now parse the attributes
            attributename = taglist0list[-1]
            # put a fake att name at end of last taglist entry for consistent parsing
            taglist[-1] = taglist[-1]+" f"
            AttDict = {}
            taglistindex = 1
            lasttaglistindex = len(taglist)
            while taglistindex<lasttaglistindex:
                attentry = taglist[taglistindex]
                taglistindex = taglistindex+1
                attentry = attentry.strip()
                if not attentry.startswith('"'):
                    raise ValueError("attribute value must start with double quotes" + repr(attentry))
                while '"' not in attentry[1:]:
                    # must have an = inside the attribute value...
                    # (>= so that running out of pieces is reported as an
                    # unclosed value instead of an IndexError)
                    if taglistindex>=lasttaglistindex:
                        raise ValueError("unclosed value " + repr(attentry))
                    nextattentry = taglist[taglistindex]
                    taglistindex = taglistindex+1
                    attentry = "%s=%s" % (attentry, nextattentry)
                attentry = attentry.strip() # only needed for while loop...
                # The last word of the entry is the NEXT attribute's name;
                # everything before it is this attribute's value.
                attlist = attentry.split()
                nextattname = attlist[-1]
                attvalue = attentry[:-len(nextattname)]
                attvalue = attvalue.strip()
                try:
                    first = attvalue[0]; last=attvalue[-1]
                except IndexError:
                    raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry,attlist)))
                if first==last=='"' or first==last=="'":
                    attvalue = attvalue[1:-1]
                AttDict[attributename] = attvalue
                attributename = nextattname
    # pass over other tags and content looking for end tag
    if docontents is not None:
        ContentList = L
    while docontents is not None:
        nextopenbracket = xmltext.find("<", cursor)
        if nextopenbracket<cursor:
            # no next open bracket found
            if name==NONAME:
                docontents=None # done
                remainder = xmltext[cursor:]
                cursor = len(xmltext)
                if remainder:
                    L.append(remainder)
            else:
                raise ValueError("no close bracket for %s found after %s" % (name,repr(xmltext[cursor: cursor+20])))
        # is it a close bracket?  (slice: the "<" may be the last character)
        elif xmltext[nextopenbracket+1:nextopenbracket+2]=="/":
            nextclosebracket = xmltext.find(">", nextopenbracket)
            if nextclosebracket<nextopenbracket:
                raise ValueError("unclosed close tag %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
            closetagcontents = xmltext[nextopenbracket+2: nextclosebracket]
            closetaglist = closetagcontents.split()
            if not closetaglist:
                raise ValueError("bad close tag format %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
            # name should match
            closename = closetaglist[0]
            if name!=closename:
                prefix = xmltext[:cursor]
                endlinenum = len(prefix.split("\n"))
                prefix = xmltext[:startingat]
                linenum = len(prefix.split("\n"))
                raise ValueError("at lines %s...%s close tag name doesn't match %s...%s %s" %(
                    linenum, endlinenum, repr(name), repr(closename), repr(xmltext[cursor: cursor+100])))
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            cursor = nextclosebracket+1
            docontents = None # done
        # otherwise we are looking at a new tag, recursively parse it...
        # first record any intervening content
        else:
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            (parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
            # comments come back as None and are dropped
            if parsetree:
                L.append(parsetree)
    # maybe should check for trailing garbage?
    if ContentList:
        if entityReplacer: ContentList = entityReplacer(ContentList)
    t = (NameString, AttDict, ContentList, ExtraStuff)
    return (t, cursor)

import types

def pprettyprint(parsedxml):
    """Pretty printer mainly for testing.

    parsedxml is either a piece of text, which is returned unchanged, or
    a ``(name, attdict, textlist, extra)`` tuple.  A tuple is rendered as
    an open tag, its children one per line with a one-space indent, and a
    close tag; a textlist of None is rendered as ``<name attrs/>``.  A
    node with an empty name (the top level) returns only its rendered
    children.

    Raises ValueError if a node has attributes but no name.
    """
    # Text nodes are str; the previous ``type(...) is bytes`` test never
    # matched them, so they fell through to the tuple unpacking below.
    if isinstance(parsedxml, (str, bytes)):
        return parsedxml
    (name, attdict, textlist, extra) = parsedxml
    if not attdict: attdict={}
    attributes = " ".join("%s=%s" % (k, repr(v)) for k, v in attdict.items())
    if not name and attributes:
        raise ValueError("name missing with attributes???")
    if textlist is not None:
        # with content
        textpprint = "\n".join(map(pprettyprint, textlist))
        if not name:
            return textpprint # no outer tag
        # indent it
        textpprint = " " + "\n ".join(textpprint.split("\n"))
        return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
    # otherwise must be a simple tag
    return "<%s %s/>" % (name, attributes)

dump = 0

def testparse(s):
    """Parse s with parsexmlSimple, print the elapsed time, optionally dump.

    The module-level ``dump`` flag is a bit mask: bit 4 pretty-prints the
    raw parse tree with pprint, bit 1 prints the tree re-rendered by
    pprettyprint.
    """
    from time import time
    from pprint import pprint
    started = time()
    tree = parsexmlSimple(s)
    print("DONE", time()-started)
    if dump&4:
        pprint(tree)
    if dump&1:
        print("============== reformatting")
        print(pprettyprint(tree))

def test():
    """Run testparse on a small sample covering escaped text, a simple tag,
    a comment, a CDATA section and angle brackets inside an attribute value."""
    # The "<" and ">" in the text are written as entities; a bare "<>"
    # would be read as a tag with no name.
    testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
<!-- comment -->
<![CDATA[
<this type="xml">text <b>in</b> xml</this> ]]>
<tag with="<brackets in values>">just testing brackets feature</tag>
""")

# Sample documents parsed when the module is run as a script.
filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
    "samples/hamlet.xml"]

#filenames = ["moa.xml"]

dump=1
if __name__=="__main__":
    test()
    from time import time
    now = time()
    for f in filenames:
        # Close each sample file as soon as it has been read.
        with open(f) as fh:
            t = fh.read()
        print("parsing", f)
        testparse(t)
    print("elapsed", time()-now)