Files
2025-07-26 05:58:59 +00:00

431 lines
14 KiB
Go

package xmlquery
import (
"bufio"
"bytes"
"encoding/xml"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"sync"
"github.com/antchfx/xpath"
"golang.org/x/net/html/charset"
)
var xmlMIMERegex = regexp.MustCompile(`(?i)((application|image|message|model)/((\w|\.|-)+\+?)?|text/)(wb)?xml`)
// LoadURL loads the XML document from the specified URL.
func LoadURL(url string) (*Node, error) {
resp, err := http.Get(url)
if err != nil {
return nil, err
}
defer resp.Body.Close()
// Make sure the Content-Type has a valid XML MIME type
if xmlMIMERegex.MatchString(resp.Header.Get("Content-Type")) {
return Parse(resp.Body)
}
return nil, fmt.Errorf("invalid XML document(%s)", resp.Header.Get("Content-Type"))
}
// Parse returns the parse tree for the XML from the given Reader.
func Parse(r io.Reader) (*Node, error) {
return ParseWithOptions(r, ParserOptions{})
}
// ParseWithOptions is like parse, but with custom options
func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
p := createParser(r)
options.apply(p)
var err error
for err == nil {
_, err = p.parse()
}
if err == io.EOF {
// additional check for validity
// according to: https://www.w3.org/TR/xml
// the document MUST contain at least ONE element
valid := false
for doc := p.doc; doc != nil; doc = doc.NextSibling {
for node := doc.FirstChild; node != nil; node = node.NextSibling {
if node.Type == ElementNode {
valid = true
break
}
}
}
if !valid {
return nil, fmt.Errorf("xmlquery: invalid XML document")
}
return p.doc, nil
}
return nil, err
}
type parser struct {
decoder *xml.Decoder
doc *Node
level int
prev *Node
streamElementXPath *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s).
streamElementFilter *xpath.Expr // If specified, it provides further filtering on the target element.
streamNode *Node // Need to remember the last target node So we can clean it up upon next Read() call.
streamNodePrev *Node // Need to remember target node's prev so upon target node removal, we can restore correct prev.
reader *cachedReader // Need to maintain a reference to the reader, so we can determine whether a node contains CDATA.
once sync.Once
space2prefix map[string]*xmlnsPrefix
}
type xmlnsPrefix struct {
name string
level int
}
func createParser(r io.Reader) *parser {
reader := newCachedReader(bufio.NewReader(r))
p := &parser{
decoder: xml.NewDecoder(reader),
doc: &Node{Type: DocumentNode},
level: 0,
reader: reader,
}
if p.decoder.CharsetReader == nil {
p.decoder.CharsetReader = charset.NewReaderLabel
}
p.prev = p.doc
return p
}
func (p *parser) parse() (*Node, error) {
p.once.Do(func() {
p.space2prefix = map[string]*xmlnsPrefix{"http://www.w3.org/XML/1998/namespace": {name: "xml", level: 0}}
})
var streamElementNodeCounter int
for {
p.reader.StartCaching()
tok, err := p.decoder.Token()
p.reader.StopCaching()
if err != nil {
return nil, err
}
switch tok := tok.(type) {
case xml.StartElement:
if p.level == 0 {
// mising XML declaration
attributes := make([]Attr, 1)
attributes[0].Name = xml.Name{Local: "version"}
attributes[0].Value = "1.0"
node := &Node{
Type: DeclarationNode,
Data: "xml",
Attr: attributes,
level: 1,
}
AddChild(p.prev, node)
p.level = 1
p.prev = node
}
for _, att := range tok.Attr {
if att.Name.Local == "xmlns" {
// https://github.com/antchfx/xmlquery/issues/67
if prefix, ok := p.space2prefix[att.Value]; !ok || (ok && prefix.level >= p.level) {
p.space2prefix[att.Value] = &xmlnsPrefix{name: "", level: p.level} // reset empty if exist the default namespace
}
} else if att.Name.Space == "xmlns" {
// maybe there are have duplicate NamespaceURL?
p.space2prefix[att.Value] = &xmlnsPrefix{name: att.Name.Local, level: p.level}
}
}
if space := tok.Name.Space; space != "" {
if _, found := p.space2prefix[space]; !found && p.decoder.Strict {
return nil, fmt.Errorf("xmlquery: invalid XML document, namespace %s is missing", space)
}
}
attributes := make([]Attr, len(tok.Attr))
for i, att := range tok.Attr {
name := att.Name
if prefix, ok := p.space2prefix[name.Space]; ok {
name.Space = prefix.name
}
attributes[i] = Attr{
Name: name,
Value: att.Value,
NamespaceURI: att.Name.Space,
}
}
node := &Node{
Type: ElementNode,
Data: tok.Name.Local,
NamespaceURI: tok.Name.Space,
Attr: attributes,
level: p.level,
}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
if node.NamespaceURI != "" {
if v, ok := p.space2prefix[node.NamespaceURI]; ok {
cached := string(p.reader.CacheWithLimit(len(v.name) + len(node.Data) + 2))
if strings.HasPrefix(cached, fmt.Sprintf("%s:%s", v.name, node.Data)) || strings.HasPrefix(cached, fmt.Sprintf("<%s:%s", v.name, node.Data)) {
node.Prefix = v.name
}
}
}
// If we're in the streaming mode, we need to remember the node if it is the target node
// so that when we finish processing the node's EndElement, we know how/what to return to
// caller. Also we need to remove the target node from the tree upon next Read() call so
// memory doesn't grow unbounded.
if p.streamElementXPath != nil {
if p.streamNode == nil {
if QuerySelector(p.doc, p.streamElementXPath) != nil {
p.streamNode = node
p.streamNodePrev = p.prev
streamElementNodeCounter = 1
}
} else {
streamElementNodeCounter++
}
}
p.prev = node
p.level++
case xml.EndElement:
p.level--
// If we're in streaming mode, and we already have a potential streaming
// target node identified (p.streamNode != nil) then we need to check if
// this is the real one we want to return to caller.
if p.streamNode != nil {
streamElementNodeCounter--
if streamElementNodeCounter == 0 {
// Now we know this element node is the at least passing the initial
// p.streamElementXPath check and is a potential target node candidate.
// We need to have 1 more check with p.streamElementFilter (if given) to
// ensure it is really the element node we want.
// The reason we need a two-step check process is because the following
// situation:
// <AAA><BBB>b1</BBB></AAA>
// And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during
// xml.StartElement time, the <BBB> node is still empty, so it will pass
// the p.streamElementXPath check. However, eventually we know this <BBB>
// shouldn't be returned to the caller. Having a second more fine-grained
// filter check ensures that. So in this case, the caller should really
// setup the stream parser with:
// streamElementXPath = "/AAA/BBB["
// streamElementFilter = "/AAA/BBB[. != 'b1']"
if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
return p.streamNode, nil
}
// otherwise, this isn't our target node, clean things up.
// note we also remove the underlying *Node from the node tree, to prevent
// future stream node candidate selection error.
RemoveFromTree(p.streamNode)
p.prev = p.streamNodePrev
p.streamNode = nil
p.streamNodePrev = nil
}
}
case xml.CharData:
// First, normalize the cache...
cached := bytes.ToUpper(p.reader.CacheWithLimit(9))
nodeType := TextNode
if bytes.HasPrefix(cached, []byte("<![CDATA[")) || bytes.HasPrefix(cached, []byte("![CDATA[")) {
nodeType = CharDataNode
}
node := &Node{Type: nodeType, Data: string(tok), level: p.level}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
case xml.Comment:
node := &Node{Type: CommentNode, Data: string(tok), level: p.level}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
case xml.ProcInst: // Processing Instruction
if p.prev.Type != DeclarationNode {
p.level++
}
node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level}
pairs := strings.Split(string(tok.Inst), " ")
for _, pair := range pairs {
pair = strings.TrimSpace(pair)
if i := strings.Index(pair, "="); i > 0 {
AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"'`))
}
}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
p.prev = node
case xml.Directive:
node := &Node{Type: NotationNode, Data: string(tok), level: p.level}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
}
}
}
// StreamParser enables loading and parsing an XML document in a streaming
// fashion.
type StreamParser struct {
p *parser
}
// CreateStreamParser creates a StreamParser. Argument streamElementXPath is
// required.
// Argument streamElementFilter is optional and should only be used in advanced
// scenarios.
//
// Scenario 1: simple case:
//
// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
// if err != nil {
// panic(err)
// }
// for {
// n, err := sp.Read()
// if err != nil {
// break
// }
// fmt.Println(n.OutputXML(true))
// }
//
// Output will be:
//
// <BBB>b1</BBB>
// <BBB>b2</BBB>
//
// Scenario 2: advanced case:
//
// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
// if err != nil {
// panic(err)
// }
// for {
// n, err := sp.Read()
// if err != nil {
// break
// }
// fmt.Println(n.OutputXML(true))
// }
//
// Output will be:
//
// <BBB>b2</BBB>
//
// As the argument names indicate, streamElementXPath should be used for
// providing xpath query pointing to the target element node only, no extra
// filtering on the element itself or its children; while streamElementFilter,
// if needed, can provide additional filtering on the target element and its
// children.
//
// CreateStreamParser returns an error if either streamElementXPath or
// streamElementFilter, if provided, cannot be successfully parsed and compiled
// into a valid xpath query.
func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) {
return CreateStreamParserWithOptions(r, ParserOptions{}, streamElementXPath, streamElementFilter...)
}
// CreateStreamParserWithOptions is like CreateStreamParser, but with custom options
func CreateStreamParserWithOptions(
r io.Reader,
options ParserOptions,
streamElementXPath string,
streamElementFilter ...string,
) (*StreamParser, error) {
elemXPath, err := getQuery(streamElementXPath)
if err != nil {
return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error())
}
elemFilter := (*xpath.Expr)(nil)
if len(streamElementFilter) > 0 {
elemFilter, err = getQuery(streamElementFilter[0])
if err != nil {
return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error())
}
}
parser := createParser(r)
options.apply(parser)
sp := &StreamParser{
p: parser,
}
sp.p.streamElementXPath = elemXPath
sp.p.streamElementFilter = elemFilter
return sp, nil
}
// Read returns a target node that satisfies the XPath specified by caller at
// StreamParser creation time. If there is no more satisfying target nodes after
// reading the rest of the XML document, io.EOF will be returned. At any time,
// any XML parsing error encountered will be returned, and the stream parsing
// stopped. Calling Read() after an error is returned (including io.EOF) results
// undefined behavior. Also note, due to the streaming nature, calling Read()
// will automatically remove any previous target node(s) from the document tree.
func (sp *StreamParser) Read() (*Node, error) {
// Because this is a streaming read, we need to release/remove last
// target node from the node tree to free up memory.
if sp.p.streamNode != nil {
// We need to remove all siblings before the current stream node,
// because the document may contain unwanted nodes between the target
// ones (for example new line text node), which would otherwise
// accumulate as first childs, and slow down the stream over time
for sp.p.streamNode.PrevSibling != nil {
RemoveFromTree(sp.p.streamNode.PrevSibling)
}
sp.p.prev = sp.p.streamNode.Parent
RemoveFromTree(sp.p.streamNode)
sp.p.streamNode = nil
sp.p.streamNodePrev = nil
}
return sp.p.parse()
}