# REX/Perl 1.0
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.
#
# 06Apr03 Brian Theado - Direct translation from Perl to Tcl
#
# Modification: statement line breaks restored (the script had been collapsed
# onto a single comment line, so nothing was defined), explanatory comments
# added, and "--" added to the regexp call. The regular expressions themselves
# are unchanged.
#
# The pattern is assembled bottom-up from small fragments. Every group is
# non-capturing, so [regexp -inline -all] yields exactly one list element per
# matched token. Most fragments end in optional pieces (">?", "(?:...)?") so
# that markup which is cut short still matches as a token instead of failing.

# A run of one or more characters other than "<" (character data).
set TextSE "\[^<]+"

# Zero or more non-hyphens followed by a single hyphen.
set UntilHyphen "\[^-]*-"
# Everything up to and including the first "--".
set Until2Hyphens "${UntilHyphen}(?:\[^-]$UntilHyphen)*-"
# Rest of a comment after the opening "<!--": through "--" and an optional ">".
set CommentCE "${Until2Hyphens}>?"

# Everything up to and including the first run of two or more "]".
set UntilRSBs "\[^\\]]*](?:\[^\\]]+])*]+"
# Rest of a CDATA section after "<!\[CDATA\[": through the first "]]>".
set CDATA_CE "${UntilRSBs}(?:\[^\\]>]$UntilRSBs)*>"

# One or more whitespace characters (space, newline, tab, carriage return).
set S "\[ \\n\\t\\r]+"
# First character of a name: ASCII letter, "_", ":" or anything above \x7F.
set NameStrt "\[A-Za-z_:]|\[^\\x00-\\x7F]"
# Subsequent name characters additionally allow digits, "." and "-".
set NameChar "\[A-Za-z0-9_:.-]|\[^\\x00-\\x7F]"
set Name "(?:$NameStrt)(?:$NameChar)*"

# A double-quoted or single-quoted string.
set QuoteSE "\"\[^\"]*\"|'\[^']*'"
# DOCTYPE identification: whitespace, a name, then any number of
# whitespace-separated names or quoted strings.
set DT_IdentSE "$S${Name}(?:${S}(?:${Name}|$QuoteSE))*"
# Rest of a markup declaration: quoted strings or characters other than
# ] " ' > <, terminated by ">".
set MarkupDeclCE "(?:\[^\\]\"'><]+|$QuoteSE)*>"

# Exactly one whitespace character.
set S1 "\[\\n\\r\\t ]"
# Everything up to and including a run of one or more "?".
set UntilQMs "\[^?]*\\?+"
# Rest of a processing instruction after its name: either an immediate "?>"
# or one whitespace character followed by text through the first "?>".
set PI_Tail "\\?>|$S1${UntilQMs}(?:\[^>?]$UntilQMs)*>"

# One item inside the bracketed part of a DOCTYPE: a complete comment, a
# markup declaration, a processing instruction, a "%Name;" reference, or
# whitespace.
set DT_ItemSE "<(?:!(?:--${Until2Hyphens}>|\[^-]$MarkupDeclCE)|\\?${Name}(?:$PI_Tail))|%$Name;|$S"
# Rest of a DOCTYPE declaration after the keyword "DOCTYPE".
set DocTypeCE "${DT_IdentSE}(?:$S)?(?:\\\[(?:$DT_ItemSE)*](?:$S)?)?>?"

# What may follow "<!": a comment, a CDATA section, or a DOCTYPE declaration.
set DeclCE "--(?:$CommentCE)?|\\\[CDATA\\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"
# What may follow "<?": a processing instruction.
set PI_CE "${Name}(?:$PI_Tail)?"
# What may follow "</": an end tag.
set EndTagCE "${Name}(?:$S)?>?"
# A quoted attribute value that contains no "<".
set AttValSE "\"\[^<\"]*\"|'\[^<']*'"
# What may follow "<": an element name, attribute=value pairs, and an
# optional "/" and ">".
set ElemTagCE "${Name}(?:$S${Name}(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"

# Any markup token: "<" followed by one of the completions above (or nothing).
set MarkupSPE "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"
# The complete token pattern: a text run or a markup token.
set XML_SPE "$TextSE|$MarkupSPE"

# ShallowParse --
#
#   Split a string of XML into a flat list of tokens, where each token is
#   either one markup item (tag, comment, CDATA section, processing
#   instruction, DOCTYPE) or the run of text between two markup items.
#
# Arguments:
#   xml   The text to tokenize.
#
# Results:
#   A list with one element per successive match of the global pattern
#   XML_SPE in $xml, in document order. An empty string yields an empty list.
proc ShallowParse {xml} {
    global XML_SPE
    # "--" ends option parsing so the pattern can never be taken for a switch.
    return [regexp -inline -all -- $XML_SPE $xml]
}

if 0 {Example use:

% set xml {<html> <head> <title>XML Shallow Parsing with Regular Expressions</title> <meta http-equiv="Pragma" content="no-cache"></meta> <meta http-equiv="Expire" content="Mon, 04 Dec 1999 21:29:02 GMT"></meta> <link rel="stylesheet" href="http://wiki.tcl.tk/wikit.css" type="text/css"></link> <base href="http://wiki.tcl.tk/"> </head>}
% ShallowParse $xml
<html> { } <head> { } <title> {XML Shallow Parsing with Regular Expressions} </title> { } {<meta http-equiv="Pragma" content="no-cache">} </meta> { } {<meta http-equiv="Expire" content="Mon, 04 Dec 1999 21:29:02 GMT">} </meta> { } {<link rel="stylesheet" href="http://wiki.tcl.tk/wikit.css" type="text/css">} </link> { } {<base href="http://wiki.tcl.tk/">} { } </head>
}

if 0 {
}