feat: Initial commit
This commit is contained in:
+32
@@ -0,0 +1,32 @@
|
||||
# vscode
|
||||
.vscode
|
||||
debug
|
||||
*.test
|
||||
|
||||
./build
|
||||
|
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects)
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
|
||||
|
||||
# Folders
|
||||
_obj
|
||||
_test
|
||||
|
||||
# Architecture specific extensions/prefixes
|
||||
*.[568vq]
|
||||
[568vq].out
|
||||
|
||||
*.cgo1.go
|
||||
*.cgo2.c
|
||||
_cgo_defun.c
|
||||
_cgo_gotypes.go
|
||||
_cgo_export.*
|
||||
|
||||
_testmain.go
|
||||
|
||||
*.exe
|
||||
*.test
|
||||
*.prof
|
||||
+17
@@ -0,0 +1,17 @@
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
+302
@@ -0,0 +1,302 @@
|
||||
# xmlquery
|
||||
|
||||
[](https://github.com/antchfx/xmlquery/actions/workflows/testing.yml)
|
||||
[](https://godoc.org/github.com/antchfx/xmlquery)
|
||||
[](https://goreportcard.com/report/github.com/antchfx/xmlquery)
|
||||
|
||||
# Overview
|
||||
|
||||
`xmlquery` is an XPath query package for XML documents, allowing you to extract
|
||||
data from XML documents, or evaluate XPath expressions against them.
|
||||
|
||||
`xmlquery` has a built-in query object caching feature that caches recently used
|
||||
XPath query strings. Enabling caching avoids recompiling the XPath expression for
|
||||
each query.
|
||||
|
||||
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
|
||||
|
||||
[htmlquery](https://github.com/antchfx/htmlquery) - Package for the HTML document query.
|
||||
|
||||
[xmlquery](https://github.com/antchfx/xmlquery) - Package for the XML document query.
|
||||
|
||||
[jsonquery](https://github.com/antchfx/jsonquery) - Package for the JSON document query.
|
||||
|
||||
# Installation
|
||||
|
||||
```
|
||||
$ go get github.com/antchfx/xmlquery
|
||||
```
|
||||
|
||||
# Quick Starts
|
||||
|
||||
```go
|
||||
import (
|
||||
"github.com/antchfx/xmlquery"
|
||||
)
|
||||
|
||||
func main(){
|
||||
s := `<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>W3Schools Home Page</title>
|
||||
<link>https://www.w3schools.com</link>
|
||||
<description>Free web building tutorials</description>
|
||||
<item>
|
||||
<title>RSS Tutorial</title>
|
||||
<link>https://www.w3schools.com/xml/xml_rss.asp</link>
|
||||
<description>New RSS tutorial on W3Schools</description>
|
||||
</item>
|
||||
<item>
|
||||
<title>XML Tutorial</title>
|
||||
<link>https://www.w3schools.com/xml</link>
|
||||
<description>New XML tutorial on W3Schools</description>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
doc, err := xmlquery.Parse(strings.NewReader(s))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
channel := xmlquery.FindOne(doc, "//channel")
|
||||
if n := channel.SelectElement("title"); n != nil {
|
||||
fmt.Printf("title: %s\n", n.InnerText())
|
||||
}
|
||||
if n := channel.SelectElement("link"); n != nil {
|
||||
fmt.Printf("link: %s\n", n.InnerText())
|
||||
}
|
||||
for i, n := range xmlquery.Find(doc, "//item/title") {
|
||||
fmt.Printf("#%d %s\n", i, n.InnerText())
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
# Getting Started
|
||||
|
||||
### Find specified XPath query.
|
||||
|
||||
```go
|
||||
list, err := xmlquery.QueryAll(doc, "a")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
```
|
||||
|
||||
#### Parse an XML from URL.
|
||||
|
||||
```go
|
||||
doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml")
|
||||
```
|
||||
|
||||
#### Parse an XML from string.
|
||||
|
||||
```go
|
||||
s := `<?xml version="1.0" encoding="utf-8"?><rss version="2.0"></rss>`
|
||||
doc, err := xmlquery.Parse(strings.NewReader(s))
|
||||
```
|
||||
|
||||
#### Parse an XML from io.Reader.
|
||||
|
||||
```go
|
||||
f, err := os.Open("../books.xml")
|
||||
doc, err := xmlquery.Parse(f)
|
||||
```
|
||||
|
||||
#### Parse an XML in a stream fashion (simple case without elements filtering).
|
||||
|
||||
```go
|
||||
f, _ := os.Open("../books.xml")
|
||||
p, err := xmlquery.CreateStreamParser(f, "/bookstore/book")
|
||||
for {
|
||||
n, err := p.Read()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(n)
|
||||
}
|
||||
```
|
||||
|
||||
Note: `CreateStreamParser()` is used to save memory when you have a large XML file to parse.
|
||||
|
||||
#### Parse an XML in a stream fashion (simple case with advanced element filtering).
|
||||
|
||||
```go
|
||||
f, _ := os.Open("../books.xml")
|
||||
p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]")
|
||||
for {
|
||||
n, err := p.Read()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(n)
|
||||
}
|
||||
```
|
||||
|
||||
#### Find authors of all books in the bookstore.
|
||||
|
||||
```go
|
||||
list := xmlquery.Find(doc, "//book//author")
|
||||
// or
|
||||
list := xmlquery.Find(doc, "//author")
|
||||
```
|
||||
|
||||
#### Find the second book.
|
||||
|
||||
```go
|
||||
book := xmlquery.FindOne(doc, "//book[2]")
|
||||
```
|
||||
|
||||
#### Find the last book.
|
||||
|
||||
```go
|
||||
book := xmlquery.FindOne(doc, "//book[last()]")
|
||||
```
|
||||
|
||||
#### Find all book elements and only get `id` attribute.
|
||||
|
||||
```go
|
||||
list := xmlquery.Find(doc,"//book/@id")
|
||||
fmt.Println(list[0].InnerText()) // output @id value
|
||||
```
|
||||
|
||||
#### Find all books with id `bk104`.
|
||||
|
||||
```go
|
||||
list := xmlquery.Find(doc, "//book[@id='bk104']")
|
||||
```
|
||||
|
||||
#### Find all books with price less than 5.
|
||||
|
||||
```go
|
||||
list := xmlquery.Find(doc, "//book[price<5]")
|
||||
```
|
||||
|
||||
#### Evaluate total price of all books.
|
||||
|
||||
```go
|
||||
expr, err := xpath.Compile("sum(//book/price)")
|
||||
price := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
|
||||
fmt.Printf("total price: %f\n", price)
|
||||
```
|
||||
|
||||
#### Count the number of books.
|
||||
|
||||
```go
|
||||
expr, err := xpath.Compile("count(//book)")
|
||||
count := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
|
||||
```
|
||||
|
||||
#### Calculate the total price of all book prices.
|
||||
|
||||
```go
|
||||
expr, err := xpath.Compile("sum(//book/price)")
|
||||
price := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
|
||||
```
|
||||
|
||||
# Advanced Features
|
||||
|
||||
### Parse `UTF-16` XML file with `ParseWithOptions()`.
|
||||
|
||||
```go
|
||||
f, _ := os.Open(`UTF-16.XML`)
|
||||
// Convert UTF-16 XML to UTF-8
|
||||
utf16ToUtf8Transformer := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
|
||||
utf8Reader := transform.NewReader(f, utf16ToUtf8Transformer)
|
||||
// Sets `CharsetReader`
|
||||
options := xmlquery.ParserOptions{
|
||||
Decoder: &xmlquery.DecoderOptions{
|
||||
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
|
||||
return input, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
doc, err := xmlquery.ParseWithOptions(utf8Reader, options)
|
||||
```
|
||||
|
||||
### Query with custom namespace prefix.
|
||||
|
||||
```go
|
||||
s := `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<pd:ProcessDefinition xmlns:pd="http://xmlns.xyz.com/process/2003" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
|
||||
<pd:activity name="Invoke Request-Response Service">
|
||||
<pd:type>RequestReplyActivity</pd:type>
|
||||
<pd:resourceType>OpClientReqActivity</pd:resourceType>
|
||||
<pd:x>300</pd:x>
|
||||
<pd:y>80</pd:y>
|
||||
</pd:activity>
|
||||
</pd:ProcessDefinition>`
|
||||
nsMap := map[string]string{
|
||||
"q": "http://xmlns.xyz.com/process/2003",
|
||||
"r": "http://www.w3.org/1999/XSL/Transform",
|
||||
"s": "http://www.w3.org/2001/XMLSchema",
|
||||
}
|
||||
expr, _ := xpath.CompileWithNS("//q:activity", nsMap)
|
||||
node := xmlquery.QuerySelector(doc, expr)
|
||||
```
|
||||
|
||||
#### Create XML document without calling `xml.Marshal`.
|
||||
|
||||
```go
|
||||
doc := &xmlquery.Node{
|
||||
Type: xmlquery.DeclarationNode,
|
||||
Data: "xml",
|
||||
Attr: []xml.Attr{
|
||||
xml.Attr{Name: xml.Name{Local: "version"}, Value: "1.0"},
|
||||
},
|
||||
}
|
||||
root := &xmlquery.Node{
|
||||
Data: "rss",
|
||||
Type: xmlquery.ElementNode,
|
||||
}
|
||||
doc.FirstChild = root
|
||||
channel := &xmlquery.Node{
|
||||
Data: "channel",
|
||||
Type: xmlquery.ElementNode,
|
||||
}
|
||||
root.FirstChild = channel
|
||||
title := &xmlquery.Node{
|
||||
Data: "title",
|
||||
Type: xmlquery.ElementNode,
|
||||
}
|
||||
title_text := &xmlquery.Node{
|
||||
Data: "W3Schools Home Page",
|
||||
Type: xmlquery.TextNode,
|
||||
}
|
||||
title.FirstChild = title_text
|
||||
channel.FirstChild = title
|
||||
|
||||
fmt.Println(doc.OutputXML(true))
|
||||
fmt.Println(doc.OutputXMLWithOptions(xmlquery.WithOutputSelf()))
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0"?><rss><channel><title>W3Schools Home Page</title></channel></rss>
|
||||
```
|
||||
|
||||
# FAQ
|
||||
|
||||
#### `Find()` vs `QueryAll()`, which is better?
|
||||
|
||||
`Find` and `QueryAll` both do the same thing: they search for all matching XML nodes.
|
||||
`Find` panics if provided with an invalid XPath query, while `QueryAll` returns
|
||||
an error.
|
||||
|
||||
#### Can I save my query expression object for the next query?
|
||||
|
||||
Yes, you can. We provide `QuerySelector` and `QuerySelectorAll` methods; they
|
||||
accept your query expression object.
|
||||
|
||||
Caching a query expression object avoids recompiling the XPath query
|
||||
expression, improving query performance.
|
||||
|
||||
# Questions
|
||||
|
||||
Please let me know if you have any questions
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/golang/groupcache/lru"
|
||||
|
||||
"github.com/antchfx/xpath"
|
||||
)
|
||||
|
||||
// DisableSelectorCache disables caching of compiled query selectors when set
// to true.
var DisableSelectorCache = false

// SelectorCacheMaxEntries sets how many compiled selector objects can be
// cached. Default is 50.
// Caching is disabled if SelectorCacheMaxEntries <= 0.
var SelectorCacheMaxEntries = 50

var (
	// cacheOnce guards the one-time creation of cache inside getQuery.
	cacheOnce sync.Once
	// cache stores compiled expressions keyed by their query string.
	cache *lru.Cache
	// cacheMutex is held by getQuery while it reads from or adds to cache.
	cacheMutex sync.Mutex
)
|
||||
|
||||
func getQuery(expr string) (*xpath.Expr, error) {
|
||||
if DisableSelectorCache || SelectorCacheMaxEntries <= 0 {
|
||||
return xpath.Compile(expr)
|
||||
}
|
||||
cacheOnce.Do(func() {
|
||||
cache = lru.New(SelectorCacheMaxEntries)
|
||||
})
|
||||
cacheMutex.Lock()
|
||||
defer cacheMutex.Unlock()
|
||||
if v, ok := cache.Get(expr); ok {
|
||||
return v.(*xpath.Expr), nil
|
||||
}
|
||||
v, err := xpath.Compile(expr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cache.Add(expr, v)
|
||||
return v, nil
|
||||
|
||||
}
|
||||
+79
@@ -0,0 +1,79 @@
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
)
|
||||
|
||||
// cachedReader wraps a bufio.Reader and, while caching is switched on, keeps
// a copy of the bytes it hands out in a fixed-capacity buffer.
type cachedReader struct {
	buffer  *bufio.Reader // underlying source of bytes
	cache   []byte        // bytes recorded since the last StartCaching
	caching bool          // whether reads are currently being recorded
}

// newCachedReader returns a cachedReader over r with caching switched off and
// room for 4096 recorded bytes.
func newCachedReader(r *bufio.Reader) *cachedReader {
	return &cachedReader{
		buffer:  r,
		cache:   make([]byte, 0, 4096),
		caching: false,
	}
}

// StartCaching discards anything recorded so far and starts recording.
func (c *cachedReader) StartCaching() {
	c.cache = c.cache[:0]
	c.caching = true
}

// ReadByte reads one byte from the underlying reader and records it when
// caching is on. Nothing is recorded if the read fails.
func (c *cachedReader) ReadByte() (b byte, err error) {
	b, err = c.buffer.ReadByte()
	if err != nil {
		return
	}
	if c.caching {
		c.cacheByte(b)
	}
	return
}

// Cache returns the bytes recorded since the last StartCaching. The returned
// slice shares storage with the reader's internal buffer.
func (c *cachedReader) Cache() []byte {
	return c.cache
}

// CacheWithLimit returns at most the first n recorded bytes, or nil when
// n < 1.
func (c *cachedReader) CacheWithLimit(n int) []byte {
	if n < 1 {
		return nil
	}
	l := len(c.cache)
	if n > l {
		n = l
	}
	return c.cache[:n]
}

// StopCaching stops recording; bytes already recorded stay available.
func (c *cachedReader) StopCaching() {
	c.caching = false
}

// Read reads into p from the underlying reader and, when caching is on,
// records the bytes that were read until the cache is full.
//
// The n bytes in p are valid even when err is non-nil, so they are recorded
// before the error is returned rather than being dropped.
func (c *cachedReader) Read(p []byte) (int, error) {
	n, err := c.buffer.Read(p)
	if c.caching {
		for _, b := range p[:n] {
			if !c.cacheByte(b) {
				break
			}
		}
	}
	return n, err
}

// cacheByte appends b to the cache and reports whether there was room. The
// cache never grows beyond its initial capacity.
func (c *cachedReader) cacheByte(b byte) bool {
	n := len(c.cache)
	if n == cap(c.cache) {
		return false
	}
	c.cache = c.cache[:n+1]
	c.cache[n] = b
	return true
}
|
||||
+477
@@ -0,0 +1,477 @@
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// A NodeType is the type of a Node.
type NodeType uint

const (
	// DocumentNode is a document object that, as the root of the document tree,
	// provides access to the entire XML document.
	DocumentNode NodeType = iota
	// DeclarationNode is a declaration written between "<?" and "?>"
	// (for example, <?xml version="1.0"?> ).
	DeclarationNode
	// ElementNode is an element (for example, <item> ).
	ElementNode
	// TextNode is the text content of a node.
	TextNode
	// CharDataNode is a CDATA section (for example, <![CDATA[content]]> ).
	CharDataNode
	// CommentNode is a comment (for example, <!-- my comment --> ).
	CommentNode
	// AttributeNode is an attribute of an element.
	AttributeNode
	// NotationNode is a directive written between "<!" and ">"
	// (for example, <!text...> ).
	NotationNode
)
|
||||
|
||||
// Attr is a single attribute attached to a Node.
type Attr struct {
	// Name is the attribute name. When serialized, a non-empty Name.Space is
	// written as a prefix in front of Name.Local ("space:local").
	Name xml.Name
	// Value is the attribute value; it is escaped when written out.
	Value string
	// NamespaceURI is the namespace of the attribute as reported by the XML
	// decoder. Attributes created with AddAttr leave it empty.
	NamespaceURI string
}
|
||||
|
||||
// A Node consists of a NodeType and some Data (tag name for
// element nodes, content for text) and is part of a tree of Nodes.
type Node struct {
	// Links to the neighbouring nodes in the tree; nil where there is none.
	Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node

	// Type is the kind of node.
	Type NodeType
	// Data is the tag name for element nodes and the content for text,
	// CDATA, comment and directive nodes.
	Data string
	// Prefix, when non-empty, is written in front of Data as "Prefix:Data"
	// in the start and end tags.
	Prefix string
	// NamespaceURI is the namespace of the node.
	NamespaceURI string
	// Attr holds the node's attributes in document order.
	Attr []Attr

	level int // node level in the tree
}
|
||||
|
||||
// outputConfiguration collects the settings that control XML serialization.
type outputConfiguration struct {
	printSelf              bool   // serialize the node itself, not only its children
	preserveSpaces         bool   // keep text node whitespace untouched
	emptyElementTagSupport bool   // write childless elements as <empty/>
	skipComments           bool   // leave comment nodes out of the output
	useIndentation         string // indentation unit; empty means no indentation
}

// OutputOption is a functional option that adjusts how a Node is serialized.
type OutputOption func(*outputConfiguration)

// WithOutputSelf makes the output include the node itself instead of only
// its children.
func WithOutputSelf() OutputOption {
	return func(cfg *outputConfiguration) { cfg.printSelf = true }
}

// WithEmptyTagSupport makes childless elements be written as <empty/> rather
// than <empty></empty>.
func WithEmptyTagSupport() OutputOption {
	return func(cfg *outputConfiguration) { cfg.emptyElementTagSupport = true }
}

// WithoutComments leaves comment nodes out of the output.
func WithoutComments() OutputOption {
	return func(cfg *outputConfiguration) { cfg.skipComments = true }
}

// WithPreserveSpace keeps whitespace in text nodes as it is.
func WithPreserveSpace() OutputOption {
	return func(cfg *outputConfiguration) { cfg.preserveSpaces = true }
}

// WithoutPreserveSpace turns whitespace preservation off, so text nodes are
// trimmed when written.
func WithoutPreserveSpace() OutputOption {
	return func(cfg *outputConfiguration) { cfg.preserveSpaces = false }
}

// WithIndentation sets the string used for one level of indentation in the
// output.
func WithIndentation(indentation string) OutputOption {
	return func(cfg *outputConfiguration) { cfg.useIndentation = indentation }
}
|
||||
|
||||
// newXMLName builds an xml.Name from name. If name contains a ':' that is not
// its first character, the part before the first ':' becomes Space and the
// rest becomes Local; otherwise the whole string is used as Local.
func newXMLName(name string) xml.Name {
	sep := strings.IndexByte(name, ':')
	if sep <= 0 {
		// No separator, or nothing in front of it: not a prefixed name.
		return xml.Name{Local: name}
	}
	return xml.Name{Space: name[:sep], Local: name[sep+1:]}
}
|
||||
|
||||
// Level returns the node's level in the tree, as stored in its unexported
// level field.
func (n *Node) Level() int {
	return n.level
}
|
||||
|
||||
// InnerText returns the text between the start and end tags of the object.
|
||||
func (n *Node) InnerText() string {
|
||||
var output func(*strings.Builder, *Node)
|
||||
output = func(b *strings.Builder, n *Node) {
|
||||
switch n.Type {
|
||||
case TextNode, CharDataNode:
|
||||
b.WriteString(n.Data)
|
||||
case CommentNode:
|
||||
default:
|
||||
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
||||
output(b, child)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
output(&b, n)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func (n *Node) sanitizedData(preserveSpaces bool) string {
|
||||
if preserveSpaces {
|
||||
return n.Data
|
||||
}
|
||||
return strings.TrimSpace(n.Data)
|
||||
}
|
||||
|
||||
func calculatePreserveSpaces(n *Node, pastValue bool) bool {
|
||||
if attr := n.SelectAttr("xml:space"); attr == "preserve" {
|
||||
return true
|
||||
} else if attr == "default" {
|
||||
return false
|
||||
}
|
||||
return pastValue
|
||||
}
|
||||
|
||||
// indentation tracks nesting depth while XML is written and emits the line
// breaks and indent strings for pretty-printed output. A nil *indentation is
// valid and turns every exported method into a no-op.
type indentation struct {
	level    int       // current nesting depth
	hasChild bool      // set once an element has been closed at this depth
	indent   string    // text written once per nesting level
	w        io.Writer // destination for line breaks and indents
}

// newIndentation returns an indentation writing to w, or nil when indent is
// empty (indentation disabled).
func newIndentation(indent string, w io.Writer) *indentation {
	if indent != "" {
		return &indentation{indent: indent, w: w}
	}
	return nil
}

// NewLine writes a single line break.
func (i *indentation) NewLine() error {
	if i == nil {
		return nil
	}
	_, err := io.WriteString(i.w, "\n")
	return err
}

// Open starts a new line indented to the current depth and then goes one
// level deeper.
func (i *indentation) Open() error {
	if i == nil {
		return nil
	}
	if err := i.writeIndent(); err != nil {
		return err
	}
	i.level++
	i.hasChild = false
	return nil
}

// Close goes one level up. The closing tag gets its own indented line only
// if an element was closed since the matching Open; otherwise it stays on the
// same line as the opening tag.
func (i *indentation) Close() error {
	if i == nil {
		return nil
	}
	i.level--
	if i.hasChild {
		if err := i.writeIndent(); err != nil {
			return err
		}
	}
	i.hasChild = true
	return nil
}

// writeIndent writes a line break followed by the indent string repeated once
// per nesting level.
func (i *indentation) writeIndent() error {
	if _, err := io.WriteString(i.w, "\n"); err != nil {
		return err
	}
	_, err := io.WriteString(i.w, strings.Repeat(i.indent, i.level))
	return err
}
|
||||
|
||||
// outputXML writes the XML for n and its subtree to w.
//
// preserveSpaces is the whitespace setting inherited from the caller; it is
// recalculated for n and the result is passed on to n's children. config
// supplies the comment and empty-tag settings. indent drives pretty-printing
// and may be nil, in which case its methods do nothing. The first write or
// indentation error stops the output and is returned.
func outputXML(w io.Writer, n *Node, preserveSpaces bool, config *outputConfiguration, indent *indentation) (err error) {
	preserveSpaces = calculatePreserveSpaces(n, preserveSpaces)
	switch n.Type {
	case TextNode:
		// Text is escaped, and trimmed unless whitespace is preserved.
		_, err = io.WriteString(w, html.EscapeString(n.sanitizedData(preserveSpaces)))
		return
	case CharDataNode:
		// CDATA content is written verbatim.
		_, err = fmt.Fprintf(w, "<![CDATA[%v]]>", n.Data)
		return
	case CommentNode:
		if !config.skipComments {
			_, err = fmt.Fprintf(w, "<!--%v-->", n.Data)
		}
		return
	case NotationNode:
		if err = indent.NewLine(); err != nil {
			return
		}
		_, err = fmt.Fprintf(w, "<!%s>", n.Data)
		return
	case DeclarationNode:
		// Only the opening "<?name" is written here; attributes and the
		// closing "?>" follow below.
		_, err = io.WriteString(w, "<?"+n.Data)
		if err != nil {
			return
		}
	default:
		// Every remaining node type is written as an element start tag.
		if err = indent.Open(); err != nil {
			return
		}
		if n.Prefix == "" {
			_, err = io.WriteString(w, "<"+n.Data)
		} else {
			_, err = fmt.Fprintf(w, "<%s:%s", n.Prefix, n.Data)
		}
		if err != nil {
			return
		}
	}

	// Attributes, shared by declarations and elements.
	for _, attr := range n.Attr {
		if attr.Name.Space != "" {
			_, err = fmt.Fprintf(w, ` %s:%s=`, attr.Name.Space, attr.Name.Local)
		} else {
			_, err = fmt.Fprintf(w, ` %s=`, attr.Name.Local)
		}
		if err != nil {
			return
		}

		_, err = fmt.Fprintf(w, `"%v"`, html.EscapeString(attr.Value))
		if err != nil {
			return
		}
	}
	if n.Type == DeclarationNode {
		_, err = io.WriteString(w, "?>")
	} else {
		if n.FirstChild != nil || !config.emptyElementTagSupport {
			_, err = io.WriteString(w, ">")
		} else {
			// Childless element with empty-tag support: self-close and stop,
			// there is nothing else to write for this node.
			_, err = io.WriteString(w, "/>")
			if err != nil {
				return
			}
			err = indent.Close()
			return
		}
	}
	if err != nil {
		return
	}
	// Children are written for declarations as well as for elements.
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		err = outputXML(w, child, preserveSpaces, config, indent)
		if err != nil {
			return
		}
	}
	// Declarations have no end tag; everything else gets one.
	if n.Type != DeclarationNode {
		if err = indent.Close(); err != nil {
			return
		}
		if n.Prefix == "" {
			_, err = fmt.Fprintf(w, "</%s>", n.Data)
		} else {
			_, err = fmt.Fprintf(w, "</%s:%s>", n.Prefix, n.Data)
		}
	}
	return
}
|
||||
|
||||
// OutputXML returns the text that including tags name.
|
||||
func (n *Node) OutputXML(self bool) string {
|
||||
if self {
|
||||
return n.OutputXMLWithOptions(WithOutputSelf())
|
||||
}
|
||||
return n.OutputXMLWithOptions()
|
||||
}
|
||||
|
||||
// OutputXMLWithOptions returns the text that including tags name.
|
||||
func (n *Node) OutputXMLWithOptions(opts ...OutputOption) string {
|
||||
var b strings.Builder
|
||||
n.WriteWithOptions(&b, opts...)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// Write writes xml to given writer.
|
||||
func (n *Node) Write(writer io.Writer, self bool) error {
|
||||
if self {
|
||||
return n.WriteWithOptions(writer, WithOutputSelf())
|
||||
}
|
||||
return n.WriteWithOptions(writer)
|
||||
}
|
||||
|
||||
// WriteWithOptions writes xml with given options to given writer.
|
||||
func (n *Node) WriteWithOptions(writer io.Writer, opts ...OutputOption) (err error) {
|
||||
config := &outputConfiguration{
|
||||
preserveSpaces: true,
|
||||
}
|
||||
// Set the options
|
||||
for _, opt := range opts {
|
||||
opt(config)
|
||||
}
|
||||
pastPreserveSpaces := config.preserveSpaces
|
||||
preserveSpaces := calculatePreserveSpaces(n, pastPreserveSpaces)
|
||||
b := bufio.NewWriter(writer)
|
||||
defer b.Flush()
|
||||
|
||||
ident := newIndentation(config.useIndentation, b)
|
||||
if config.printSelf && n.Type != DocumentNode {
|
||||
err = outputXML(b, n, preserveSpaces, config, ident)
|
||||
} else {
|
||||
for n := n.FirstChild; n != nil; n = n.NextSibling {
|
||||
err = outputXML(b, n, preserveSpaces, config, ident)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// AddAttr adds a new attribute specified by 'key' and 'val' to a node 'n'.
|
||||
func AddAttr(n *Node, key, val string) {
|
||||
attr := Attr{
|
||||
Name: newXMLName(key),
|
||||
Value: val,
|
||||
}
|
||||
n.Attr = append(n.Attr, attr)
|
||||
}
|
||||
|
||||
// SetAttr allows an attribute value with the specified name to be changed.
|
||||
// If the attribute did not previously exist, it will be created.
|
||||
func (n *Node) SetAttr(key, value string) {
|
||||
name := newXMLName(key)
|
||||
for i, attr := range n.Attr {
|
||||
if attr.Name == name {
|
||||
n.Attr[i].Value = value
|
||||
return
|
||||
}
|
||||
}
|
||||
AddAttr(n, key, value)
|
||||
}
|
||||
|
||||
// RemoveAttr removes the attribute with the specified name.
|
||||
func (n *Node) RemoveAttr(key string) {
|
||||
name := newXMLName(key)
|
||||
for i, attr := range n.Attr {
|
||||
if attr.Name == name {
|
||||
n.Attr = append(n.Attr[:i], n.Attr[i+1:]...)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AddChild adds a new node 'n' to a node 'parent' as its last child.
|
||||
func AddChild(parent, n *Node) {
|
||||
n.Parent = parent
|
||||
n.NextSibling = nil
|
||||
if parent.FirstChild == nil {
|
||||
parent.FirstChild = n
|
||||
n.PrevSibling = nil
|
||||
} else {
|
||||
parent.LastChild.NextSibling = n
|
||||
n.PrevSibling = parent.LastChild
|
||||
}
|
||||
|
||||
parent.LastChild = n
|
||||
}
|
||||
|
||||
// AddSibling adds a new node 'n' as a last node of sibling chain for a given node 'sibling'.
|
||||
func AddSibling(sibling, n *Node) {
|
||||
for t := sibling.NextSibling; t != nil; t = t.NextSibling {
|
||||
sibling = t
|
||||
}
|
||||
n.Parent = sibling.Parent
|
||||
sibling.NextSibling = n
|
||||
n.PrevSibling = sibling
|
||||
n.NextSibling = nil
|
||||
if sibling.Parent != nil {
|
||||
sibling.Parent.LastChild = n
|
||||
}
|
||||
}
|
||||
|
||||
// AddImmediateSibling adds a new node 'n' as immediate sibling a given node 'sibling'.
|
||||
func AddImmediateSibling(sibling, n *Node) {
|
||||
n.Parent = sibling.Parent
|
||||
n.NextSibling = sibling.NextSibling
|
||||
sibling.NextSibling = n
|
||||
n.PrevSibling = sibling
|
||||
if n.NextSibling != nil {
|
||||
n.NextSibling.PrevSibling = n
|
||||
} else if n.Parent != nil {
|
||||
sibling.Parent.LastChild = n
|
||||
}
|
||||
}
|
||||
|
||||
// RemoveFromTree removes a node and its subtree from the document
|
||||
// tree it is in. If the node is the root of the tree, then it's no-op.
|
||||
func RemoveFromTree(n *Node) {
|
||||
if n.Parent == nil {
|
||||
return
|
||||
}
|
||||
if n.Parent.FirstChild == n {
|
||||
if n.Parent.LastChild == n {
|
||||
n.Parent.FirstChild = nil
|
||||
n.Parent.LastChild = nil
|
||||
} else {
|
||||
n.Parent.FirstChild = n.NextSibling
|
||||
n.NextSibling.PrevSibling = nil
|
||||
}
|
||||
} else {
|
||||
if n.Parent.LastChild == n {
|
||||
n.Parent.LastChild = n.PrevSibling
|
||||
n.PrevSibling.NextSibling = nil
|
||||
} else {
|
||||
n.PrevSibling.NextSibling = n.NextSibling
|
||||
n.NextSibling.PrevSibling = n.PrevSibling
|
||||
}
|
||||
}
|
||||
n.Parent = nil
|
||||
n.PrevSibling = nil
|
||||
n.NextSibling = nil
|
||||
}
|
||||
|
||||
// GetRoot returns a root of the tree where 'n' is a node.
|
||||
func GetRoot(n *Node) *Node {
|
||||
if n == nil {
|
||||
return nil
|
||||
}
|
||||
root := n
|
||||
for root.Parent != nil {
|
||||
root = root.Parent
|
||||
}
|
||||
return root
|
||||
}
|
||||
+33
@@ -0,0 +1,33 @@
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"io"
|
||||
)
|
||||
|
||||
type ParserOptions struct {
|
||||
Decoder *DecoderOptions
|
||||
}
|
||||
|
||||
func (options ParserOptions) apply(parser *parser) {
|
||||
if options.Decoder != nil {
|
||||
(*options.Decoder).apply(parser.decoder)
|
||||
}
|
||||
}
|
||||
|
||||
// DecoderOptions mirrors the option fields of the standard encoding/xml
// Decoder. Please refer to its documentation:
// https://golang.org/pkg/encoding/xml/#Decoder
//
// Strict, AutoClose and Entity are always copied to the decoder, so their
// zero values (including Strict == false) take effect whenever DecoderOptions
// are supplied. CharsetReader is copied only when it is non-nil.
type DecoderOptions struct {
	Strict        bool
	AutoClose     []string
	Entity        map[string]string
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}

// apply copies the options onto decoder.
func (options DecoderOptions) apply(decoder *xml.Decoder) {
	decoder.Strict = options.Strict
	decoder.AutoClose = options.AutoClose
	decoder.Entity = options.Entity
	// A nil CharsetReader means "not specified": keep whatever reader the
	// decoder already has instead of wiping it out and losing support for
	// non-UTF-8 documents.
	if options.CharsetReader != nil {
		decoder.CharsetReader = options.CharsetReader
	}
}
|
||||
+430
@@ -0,0 +1,430 @@
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/antchfx/xpath"
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
||||
// xmlMIMERegex is the case-insensitive pattern LoadURL uses to decide whether
// a Content-Type header names an XML media type. It matches "text/xml" and
// "text/wbxml", as well as application/, image/, message/ and model/ types
// whose subtype ends in "xml" or "wbxml" (for example "application/rss+xml").
var xmlMIMERegex = regexp.MustCompile(`(?i)((application|image|message|model)/((\w|\.|-)+\+?)?|text/)(wb)?xml`)
|
||||
|
||||
// LoadURL loads the XML document from the specified URL.
|
||||
func LoadURL(url string) (*Node, error) {
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
// Make sure the Content-Type has a valid XML MIME type
|
||||
if xmlMIMERegex.MatchString(resp.Header.Get("Content-Type")) {
|
||||
return Parse(resp.Body)
|
||||
}
|
||||
return nil, fmt.Errorf("invalid XML document(%s)", resp.Header.Get("Content-Type"))
|
||||
}
|
||||
|
||||
// Parse returns the parse tree for the XML from the given Reader.
|
||||
func Parse(r io.Reader) (*Node, error) {
|
||||
return ParseWithOptions(r, ParserOptions{})
|
||||
}
|
||||
|
||||
// ParseWithOptions is like parse, but with custom options
|
||||
func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
|
||||
p := createParser(r)
|
||||
options.apply(p)
|
||||
var err error
|
||||
for err == nil {
|
||||
_, err = p.parse()
|
||||
}
|
||||
|
||||
if err == io.EOF {
|
||||
// additional check for validity
|
||||
// according to: https://www.w3.org/TR/xml
|
||||
// the document MUST contain at least ONE element
|
||||
valid := false
|
||||
for doc := p.doc; doc != nil; doc = doc.NextSibling {
|
||||
for node := doc.FirstChild; node != nil; node = node.NextSibling {
|
||||
if node.Type == ElementNode {
|
||||
valid = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !valid {
|
||||
return nil, fmt.Errorf("xmlquery: invalid XML document")
|
||||
}
|
||||
return p.doc, nil
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// parser holds the state used while turning XML decoder tokens into a tree
// of Nodes.
type parser struct {
	// decoder supplies the XML tokens.
	decoder *xml.Decoder
	// doc is the document node at the root of the tree being built.
	doc *Node
	// level is the tree level given to the nodes currently being created.
	level int
	// prev is the node added most recently; its level decides whether the
	// next node is attached as a child or as a sibling.
	prev *Node
	// Under streaming mode, this specifies the xpath to the target element node(s).
	streamElementXPath *xpath.Expr
	// If specified, it provides further filtering on the target element.
	streamElementFilter *xpath.Expr
	// Need to remember the last target node so we can clean it up upon next Read() call.
	streamNode *Node
	// Need to remember target node's prev so upon target node removal, we can restore correct prev.
	streamNodePrev *Node
	// Need to maintain a reference to the reader, so we can determine whether a node contains CDATA.
	reader *cachedReader
	// once guards the one-time initialization of space2prefix.
	once sync.Once
	// space2prefix maps a namespace URI to the prefix declared for it.
	space2prefix map[string]*xmlnsPrefix
}
|
||||
|
||||
// xmlnsPrefix records a namespace prefix seen while parsing.
type xmlnsPrefix struct {
	// name is the prefix; it is empty for a default namespace declaration.
	name string
	// level is the parser level at which the declaration was recorded.
	level int
}
|
||||
|
||||
func createParser(r io.Reader) *parser {
|
||||
reader := newCachedReader(bufio.NewReader(r))
|
||||
p := &parser{
|
||||
decoder: xml.NewDecoder(reader),
|
||||
doc: &Node{Type: DocumentNode},
|
||||
level: 0,
|
||||
reader: reader,
|
||||
}
|
||||
if p.decoder.CharsetReader == nil {
|
||||
p.decoder.CharsetReader = charset.NewReaderLabel
|
||||
}
|
||||
p.prev = p.doc
|
||||
return p
|
||||
}
|
||||
|
||||
func (p *parser) parse() (*Node, error) {
|
||||
p.once.Do(func() {
|
||||
p.space2prefix = map[string]*xmlnsPrefix{"http://www.w3.org/XML/1998/namespace": {name: "xml", level: 0}}
|
||||
})
|
||||
|
||||
var streamElementNodeCounter int
|
||||
for {
|
||||
p.reader.StartCaching()
|
||||
tok, err := p.decoder.Token()
|
||||
p.reader.StopCaching()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch tok := tok.(type) {
|
||||
case xml.StartElement:
|
||||
if p.level == 0 {
|
||||
// mising XML declaration
|
||||
attributes := make([]Attr, 1)
|
||||
attributes[0].Name = xml.Name{Local: "version"}
|
||||
attributes[0].Value = "1.0"
|
||||
node := &Node{
|
||||
Type: DeclarationNode,
|
||||
Data: "xml",
|
||||
Attr: attributes,
|
||||
level: 1,
|
||||
}
|
||||
AddChild(p.prev, node)
|
||||
p.level = 1
|
||||
p.prev = node
|
||||
}
|
||||
|
||||
for _, att := range tok.Attr {
|
||||
if att.Name.Local == "xmlns" {
|
||||
// https://github.com/antchfx/xmlquery/issues/67
|
||||
if prefix, ok := p.space2prefix[att.Value]; !ok || (ok && prefix.level >= p.level) {
|
||||
p.space2prefix[att.Value] = &xmlnsPrefix{name: "", level: p.level} // reset empty if exist the default namespace
|
||||
}
|
||||
} else if att.Name.Space == "xmlns" {
|
||||
// maybe there are have duplicate NamespaceURL?
|
||||
p.space2prefix[att.Value] = &xmlnsPrefix{name: att.Name.Local, level: p.level}
|
||||
}
|
||||
}
|
||||
|
||||
if space := tok.Name.Space; space != "" {
|
||||
if _, found := p.space2prefix[space]; !found && p.decoder.Strict {
|
||||
return nil, fmt.Errorf("xmlquery: invalid XML document, namespace %s is missing", space)
|
||||
}
|
||||
}
|
||||
|
||||
attributes := make([]Attr, len(tok.Attr))
|
||||
for i, att := range tok.Attr {
|
||||
name := att.Name
|
||||
if prefix, ok := p.space2prefix[name.Space]; ok {
|
||||
name.Space = prefix.name
|
||||
}
|
||||
attributes[i] = Attr{
|
||||
Name: name,
|
||||
Value: att.Value,
|
||||
NamespaceURI: att.Name.Space,
|
||||
}
|
||||
}
|
||||
|
||||
node := &Node{
|
||||
Type: ElementNode,
|
||||
Data: tok.Name.Local,
|
||||
NamespaceURI: tok.Name.Space,
|
||||
Attr: attributes,
|
||||
level: p.level,
|
||||
}
|
||||
|
||||
if p.level == p.prev.level {
|
||||
AddSibling(p.prev, node)
|
||||
} else if p.level > p.prev.level {
|
||||
AddChild(p.prev, node)
|
||||
} else if p.level < p.prev.level {
|
||||
for i := p.prev.level - p.level; i > 1; i-- {
|
||||
p.prev = p.prev.Parent
|
||||
}
|
||||
AddSibling(p.prev.Parent, node)
|
||||
}
|
||||
|
||||
if node.NamespaceURI != "" {
|
||||
if v, ok := p.space2prefix[node.NamespaceURI]; ok {
|
||||
cached := string(p.reader.CacheWithLimit(len(v.name) + len(node.Data) + 2))
|
||||
if strings.HasPrefix(cached, fmt.Sprintf("%s:%s", v.name, node.Data)) || strings.HasPrefix(cached, fmt.Sprintf("<%s:%s", v.name, node.Data)) {
|
||||
node.Prefix = v.name
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we're in the streaming mode, we need to remember the node if it is the target node
|
||||
// so that when we finish processing the node's EndElement, we know how/what to return to
|
||||
// caller. Also we need to remove the target node from the tree upon next Read() call so
|
||||
// memory doesn't grow unbounded.
|
||||
if p.streamElementXPath != nil {
|
||||
if p.streamNode == nil {
|
||||
if QuerySelector(p.doc, p.streamElementXPath) != nil {
|
||||
p.streamNode = node
|
||||
p.streamNodePrev = p.prev
|
||||
streamElementNodeCounter = 1
|
||||
}
|
||||
} else {
|
||||
streamElementNodeCounter++
|
||||
}
|
||||
}
|
||||
p.prev = node
|
||||
p.level++
|
||||
case xml.EndElement:
|
||||
p.level--
|
||||
// If we're in streaming mode, and we already have a potential streaming
|
||||
// target node identified (p.streamNode != nil) then we need to check if
|
||||
// this is the real one we want to return to caller.
|
||||
if p.streamNode != nil {
|
||||
streamElementNodeCounter--
|
||||
if streamElementNodeCounter == 0 {
|
||||
// Now we know this element node is the at least passing the initial
|
||||
// p.streamElementXPath check and is a potential target node candidate.
|
||||
// We need to have 1 more check with p.streamElementFilter (if given) to
|
||||
// ensure it is really the element node we want.
|
||||
// The reason we need a two-step check process is because the following
|
||||
// situation:
|
||||
// <AAA><BBB>b1</BBB></AAA>
|
||||
// And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during
|
||||
// xml.StartElement time, the <BBB> node is still empty, so it will pass
|
||||
// the p.streamElementXPath check. However, eventually we know this <BBB>
|
||||
// shouldn't be returned to the caller. Having a second more fine-grained
|
||||
// filter check ensures that. So in this case, the caller should really
|
||||
// setup the stream parser with:
|
||||
// streamElementXPath = "/AAA/BBB["
|
||||
// streamElementFilter = "/AAA/BBB[. != 'b1']"
|
||||
if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
|
||||
return p.streamNode, nil
|
||||
}
|
||||
// otherwise, this isn't our target node, clean things up.
|
||||
// note we also remove the underlying *Node from the node tree, to prevent
|
||||
// future stream node candidate selection error.
|
||||
RemoveFromTree(p.streamNode)
|
||||
p.prev = p.streamNodePrev
|
||||
p.streamNode = nil
|
||||
p.streamNodePrev = nil
|
||||
}
|
||||
}
|
||||
case xml.CharData:
|
||||
// First, normalize the cache...
|
||||
cached := bytes.ToUpper(p.reader.CacheWithLimit(9))
|
||||
nodeType := TextNode
|
||||
if bytes.HasPrefix(cached, []byte("<![CDATA[")) || bytes.HasPrefix(cached, []byte("![CDATA[")) {
|
||||
nodeType = CharDataNode
|
||||
}
|
||||
node := &Node{Type: nodeType, Data: string(tok), level: p.level}
|
||||
if p.level == p.prev.level {
|
||||
AddSibling(p.prev, node)
|
||||
} else if p.level > p.prev.level {
|
||||
AddChild(p.prev, node)
|
||||
} else if p.level < p.prev.level {
|
||||
for i := p.prev.level - p.level; i > 1; i-- {
|
||||
p.prev = p.prev.Parent
|
||||
}
|
||||
AddSibling(p.prev.Parent, node)
|
||||
}
|
||||
case xml.Comment:
|
||||
node := &Node{Type: CommentNode, Data: string(tok), level: p.level}
|
||||
if p.level == p.prev.level {
|
||||
AddSibling(p.prev, node)
|
||||
} else if p.level > p.prev.level {
|
||||
AddChild(p.prev, node)
|
||||
} else if p.level < p.prev.level {
|
||||
for i := p.prev.level - p.level; i > 1; i-- {
|
||||
p.prev = p.prev.Parent
|
||||
}
|
||||
AddSibling(p.prev.Parent, node)
|
||||
}
|
||||
case xml.ProcInst: // Processing Instruction
|
||||
if p.prev.Type != DeclarationNode {
|
||||
p.level++
|
||||
}
|
||||
node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level}
|
||||
pairs := strings.Split(string(tok.Inst), " ")
|
||||
for _, pair := range pairs {
|
||||
pair = strings.TrimSpace(pair)
|
||||
if i := strings.Index(pair, "="); i > 0 {
|
||||
AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"'`))
|
||||
}
|
||||
}
|
||||
if p.level == p.prev.level {
|
||||
AddSibling(p.prev, node)
|
||||
} else if p.level > p.prev.level {
|
||||
AddChild(p.prev, node)
|
||||
} else if p.level < p.prev.level {
|
||||
for i := p.prev.level - p.level; i > 1; i-- {
|
||||
p.prev = p.prev.Parent
|
||||
}
|
||||
AddSibling(p.prev.Parent, node)
|
||||
}
|
||||
p.prev = node
|
||||
case xml.Directive:
|
||||
node := &Node{Type: NotationNode, Data: string(tok), level: p.level}
|
||||
if p.level == p.prev.level {
|
||||
AddSibling(p.prev, node)
|
||||
} else if p.level > p.prev.level {
|
||||
AddChild(p.prev, node)
|
||||
} else if p.level < p.prev.level {
|
||||
for i := p.prev.level - p.level; i > 1; i-- {
|
||||
p.prev = p.prev.Parent
|
||||
}
|
||||
AddSibling(p.prev.Parent, node)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// StreamParser enables loading and parsing an XML document in a streaming
|
||||
// fashion.
|
||||
type StreamParser struct {
|
||||
p *parser
|
||||
}
|
||||
|
||||
// CreateStreamParser creates a StreamParser. Argument streamElementXPath is
|
||||
// required.
|
||||
// Argument streamElementFilter is optional and should only be used in advanced
|
||||
// scenarios.
|
||||
//
|
||||
// Scenario 1: simple case:
|
||||
//
|
||||
// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
|
||||
// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// for {
|
||||
// n, err := sp.Read()
|
||||
// if err != nil {
|
||||
// break
|
||||
// }
|
||||
// fmt.Println(n.OutputXML(true))
|
||||
// }
|
||||
//
|
||||
// Output will be:
|
||||
//
|
||||
// <BBB>b1</BBB>
|
||||
// <BBB>b2</BBB>
|
||||
//
|
||||
// Scenario 2: advanced case:
|
||||
//
|
||||
// xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
|
||||
// sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// for {
|
||||
// n, err := sp.Read()
|
||||
// if err != nil {
|
||||
// break
|
||||
// }
|
||||
// fmt.Println(n.OutputXML(true))
|
||||
// }
|
||||
//
|
||||
// Output will be:
|
||||
//
|
||||
// <BBB>b2</BBB>
|
||||
//
|
||||
// As the argument names indicate, streamElementXPath should be used for
|
||||
// providing xpath query pointing to the target element node only, no extra
|
||||
// filtering on the element itself or its children; while streamElementFilter,
|
||||
// if needed, can provide additional filtering on the target element and its
|
||||
// children.
|
||||
//
|
||||
// CreateStreamParser returns an error if either streamElementXPath or
|
||||
// streamElementFilter, if provided, cannot be successfully parsed and compiled
|
||||
// into a valid xpath query.
|
||||
func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) {
|
||||
return CreateStreamParserWithOptions(r, ParserOptions{}, streamElementXPath, streamElementFilter...)
|
||||
}
|
||||
|
||||
// CreateStreamParserWithOptions is like CreateStreamParser, but with custom options
|
||||
func CreateStreamParserWithOptions(
|
||||
r io.Reader,
|
||||
options ParserOptions,
|
||||
streamElementXPath string,
|
||||
streamElementFilter ...string,
|
||||
) (*StreamParser, error) {
|
||||
elemXPath, err := getQuery(streamElementXPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error())
|
||||
}
|
||||
elemFilter := (*xpath.Expr)(nil)
|
||||
if len(streamElementFilter) > 0 {
|
||||
elemFilter, err = getQuery(streamElementFilter[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error())
|
||||
}
|
||||
}
|
||||
parser := createParser(r)
|
||||
options.apply(parser)
|
||||
sp := &StreamParser{
|
||||
p: parser,
|
||||
}
|
||||
sp.p.streamElementXPath = elemXPath
|
||||
sp.p.streamElementFilter = elemFilter
|
||||
return sp, nil
|
||||
}
|
||||
|
||||
// Read returns a target node that satisfies the XPath specified by caller at
|
||||
// StreamParser creation time. If there is no more satisfying target nodes after
|
||||
// reading the rest of the XML document, io.EOF will be returned. At any time,
|
||||
// any XML parsing error encountered will be returned, and the stream parsing
|
||||
// stopped. Calling Read() after an error is returned (including io.EOF) results
|
||||
// undefined behavior. Also note, due to the streaming nature, calling Read()
|
||||
// will automatically remove any previous target node(s) from the document tree.
|
||||
func (sp *StreamParser) Read() (*Node, error) {
|
||||
// Because this is a streaming read, we need to release/remove last
|
||||
// target node from the node tree to free up memory.
|
||||
if sp.p.streamNode != nil {
|
||||
// We need to remove all siblings before the current stream node,
|
||||
// because the document may contain unwanted nodes between the target
|
||||
// ones (for example new line text node), which would otherwise
|
||||
// accumulate as first childs, and slow down the stream over time
|
||||
for sp.p.streamNode.PrevSibling != nil {
|
||||
RemoveFromTree(sp.p.streamNode.PrevSibling)
|
||||
}
|
||||
sp.p.prev = sp.p.streamNode.Parent
|
||||
RemoveFromTree(sp.p.streamNode)
|
||||
sp.p.streamNode = nil
|
||||
sp.p.streamNodePrev = nil
|
||||
}
|
||||
return sp.p.parse()
|
||||
}
|
||||
+304
@@ -0,0 +1,304 @@
|
||||
/*
|
||||
Package xmlquery extracts data from XML documents using XPath expressions.
|
||||
*/
|
||||
package xmlquery
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/antchfx/xpath"
|
||||
)
|
||||
|
||||
// SelectElements finds child elements with the specified name.
|
||||
func (n *Node) SelectElements(name string) []*Node {
|
||||
return Find(n, name)
|
||||
}
|
||||
|
||||
// SelectElement finds child elements with the specified name.
|
||||
func (n *Node) SelectElement(name string) *Node {
|
||||
return FindOne(n, name)
|
||||
}
|
||||
|
||||
// SelectAttr returns the attribute value with the specified name.
|
||||
func (n *Node) SelectAttr(name string) string {
|
||||
if n.Type == AttributeNode {
|
||||
if n.Data == name {
|
||||
return n.InnerText()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
xmlName := newXMLName(name)
|
||||
for _, attr := range n.Attr {
|
||||
if attr.Name == xmlName {
|
||||
return attr.Value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compile-time check that *NodeNavigator satisfies xpath.NodeNavigator.
var _ xpath.NodeNavigator = (*NodeNavigator)(nil)
|
||||
|
||||
// CreateXPathNavigator creates a new xpath.NodeNavigator for the specified
|
||||
// XML Node.
|
||||
func CreateXPathNavigator(top *Node) *NodeNavigator {
|
||||
return &NodeNavigator{curr: top, root: top, attr: -1}
|
||||
}
|
||||
|
||||
func getCurrentNode(it *xpath.NodeIterator) *Node {
|
||||
n := it.Current().(*NodeNavigator)
|
||||
if n.NodeType() == xpath.AttributeNode {
|
||||
childNode := &Node{
|
||||
Type: TextNode,
|
||||
Data: n.Value(),
|
||||
}
|
||||
return &Node{
|
||||
Parent: n.curr,
|
||||
Type: AttributeNode,
|
||||
Data: n.LocalName(),
|
||||
FirstChild: childNode,
|
||||
LastChild: childNode,
|
||||
}
|
||||
}
|
||||
return n.curr
|
||||
}
|
||||
|
||||
// Find is like QueryAll but panics if `expr` is not a valid XPath expression.
|
||||
// See `QueryAll()` function.
|
||||
func Find(top *Node, expr string) []*Node {
|
||||
nodes, err := QueryAll(top, expr)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
// FindOne is like Query but panics if `expr` is not a valid XPath expression.
|
||||
// See `Query()` function.
|
||||
func FindOne(top *Node, expr string) *Node {
|
||||
node, err := Query(top, expr)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return node
|
||||
}
|
||||
|
||||
// QueryAll searches the XML Node that matches by the specified XPath expr.
|
||||
// Returns an error if the expression `expr` cannot be parsed.
|
||||
func QueryAll(top *Node, expr string) ([]*Node, error) {
|
||||
exp, err := getQuery(expr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return QuerySelectorAll(top, exp), nil
|
||||
}
|
||||
|
||||
// Query searches the XML Node that matches by the specified XPath expr,
|
||||
// and returns first matched element.
|
||||
func Query(top *Node, expr string) (*Node, error) {
|
||||
exp, err := getQuery(expr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return QuerySelector(top, exp), nil
|
||||
}
|
||||
|
||||
// QuerySelectorAll searches all of the XML Node that matches the specified
|
||||
// XPath selectors.
|
||||
func QuerySelectorAll(top *Node, selector *xpath.Expr) []*Node {
|
||||
t := selector.Select(CreateXPathNavigator(top))
|
||||
var elems []*Node
|
||||
for t.MoveNext() {
|
||||
elems = append(elems, getCurrentNode(t))
|
||||
}
|
||||
return elems
|
||||
}
|
||||
|
||||
// QuerySelector returns the first matched XML Node by the specified XPath
|
||||
// selector.
|
||||
func QuerySelector(top *Node, selector *xpath.Expr) *Node {
|
||||
t := selector.Select(CreateXPathNavigator(top))
|
||||
if t.MoveNext() {
|
||||
return getCurrentNode(t)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// FindEach searches the html.Node and calls functions cb.
|
||||
// Important: this method is deprecated, instead, use for .. = range Find(){}.
|
||||
func FindEach(top *Node, expr string, cb func(int, *Node)) {
|
||||
for i, n := range Find(top, expr) {
|
||||
cb(i, n)
|
||||
}
|
||||
}
|
||||
|
||||
// FindEachWithBreak functions the same as FindEach but allows to break the loop
|
||||
// by returning false from the callback function `cb`.
|
||||
// Important: this method is deprecated, instead, use .. = range Find(){}.
|
||||
func FindEachWithBreak(top *Node, expr string, cb func(int, *Node) bool) {
|
||||
for i, n := range Find(top, expr) {
|
||||
if !cb(i, n) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type NodeNavigator struct {
|
||||
root, curr *Node
|
||||
attr int
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) Current() *Node {
|
||||
return x.curr
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) NodeType() xpath.NodeType {
|
||||
switch x.curr.Type {
|
||||
case CommentNode:
|
||||
return xpath.CommentNode
|
||||
case TextNode, CharDataNode, NotationNode:
|
||||
return xpath.TextNode
|
||||
case DeclarationNode, DocumentNode:
|
||||
return xpath.RootNode
|
||||
case ElementNode:
|
||||
if x.attr != -1 {
|
||||
return xpath.AttributeNode
|
||||
}
|
||||
return xpath.ElementNode
|
||||
}
|
||||
panic(fmt.Sprintf("unknown XML node type: %v", x.curr.Type))
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) LocalName() string {
|
||||
if x.attr != -1 {
|
||||
return x.curr.Attr[x.attr].Name.Local
|
||||
}
|
||||
return x.curr.Data
|
||||
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) Prefix() string {
|
||||
if x.NodeType() == xpath.AttributeNode {
|
||||
if x.attr != -1 {
|
||||
return x.curr.Attr[x.attr].Name.Space
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return x.curr.Prefix
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) NamespaceURL() string {
|
||||
if x.attr != -1 {
|
||||
return x.curr.Attr[x.attr].NamespaceURI
|
||||
}
|
||||
return x.curr.NamespaceURI
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) Value() string {
|
||||
switch x.curr.Type {
|
||||
case CommentNode:
|
||||
return x.curr.Data
|
||||
case ElementNode:
|
||||
if x.attr != -1 {
|
||||
return x.curr.Attr[x.attr].Value
|
||||
}
|
||||
return x.curr.InnerText()
|
||||
case TextNode:
|
||||
return x.curr.Data
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) Copy() xpath.NodeNavigator {
|
||||
n := *x
|
||||
return &n
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToRoot() {
|
||||
x.curr = x.root
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToParent() bool {
|
||||
if x.attr != -1 {
|
||||
x.attr = -1
|
||||
return true
|
||||
} else if node := x.curr.Parent; node != nil {
|
||||
x.curr = node
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToNextAttribute() bool {
|
||||
if x.attr >= len(x.curr.Attr)-1 {
|
||||
return false
|
||||
}
|
||||
x.attr++
|
||||
return true
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToChild() bool {
|
||||
if x.attr != -1 {
|
||||
return false
|
||||
}
|
||||
if node := x.curr.FirstChild; node != nil {
|
||||
x.curr = node
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToFirst() bool {
|
||||
if x.attr != -1 || x.curr.PrevSibling == nil {
|
||||
return false
|
||||
}
|
||||
for {
|
||||
node := x.curr.PrevSibling
|
||||
if node == nil {
|
||||
break
|
||||
}
|
||||
x.curr = node
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) String() string {
|
||||
return x.Value()
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToNext() bool {
|
||||
if x.attr != -1 {
|
||||
return false
|
||||
}
|
||||
for node := x.curr.NextSibling; node != nil; node = x.curr.NextSibling {
|
||||
x.curr = node
|
||||
if x.curr.Type != TextNode || strings.TrimSpace(x.curr.Data) != "" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveToPrevious() bool {
|
||||
if x.attr != -1 {
|
||||
return false
|
||||
}
|
||||
for node := x.curr.PrevSibling; node != nil; node = x.curr.PrevSibling {
|
||||
x.curr = node
|
||||
if x.curr.Type != TextNode || strings.TrimSpace(x.curr.Data) != "" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
|
||||
node, ok := other.(*NodeNavigator)
|
||||
if !ok || node.root != x.root {
|
||||
return false
|
||||
}
|
||||
|
||||
x.curr = node.curr
|
||||
x.attr = node.attr
|
||||
return true
|
||||
}
|
||||
Reference in New Issue
Block a user