feat: Initial commit

This commit is contained in:
2025-07-26 05:58:59 +00:00
commit 753d1c60ea
1849 changed files with 830533 additions and 0 deletions
+32
View File
@@ -0,0 +1,32 @@
# vscode
.vscode
debug
*.test
./build
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
+17
View File
@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+156
View File
@@ -0,0 +1,156 @@
# htmlquery
[![Build Status](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml/badge.svg)](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml)
[![GoDoc](https://godoc.org/github.com/antchfx/htmlquery?status.svg)](https://godoc.org/github.com/antchfx/htmlquery)
[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/htmlquery)](https://goreportcard.com/report/github.com/antchfx/htmlquery)
# Overview
`htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate expressions against, HTML documents using an XPath expression.
`htmlquery` has a built-in query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru); it caches recently used XPath query strings. Enabling query caching avoids recompiling the XPath expression on each query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
# XPath query packages for Go
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
# Installation
```
go get github.com/antchfx/htmlquery
```
# Getting Started
#### Query, returns matched elements or error.
```go
nodes, err := htmlquery.QueryAll(doc, "//a")
if err != nil {
panic(`not a valid XPath expression.`)
}
```
#### Load HTML document from URL.
```go
doc, err := htmlquery.LoadURL("http://example.com/")
```
#### Load HTML document from file.
```go
filePath := "/home/user/sample.html"
doc, err := htmlquery.LoadDoc(filePath)
```
#### Load HTML document from string.
```go
s := `<html>....</html>`
doc, err := htmlquery.Parse(strings.NewReader(s))
```
#### Find all A elements.
```go
list := htmlquery.Find(doc, "//a")
```
#### Find all A elements that have `href` attribute.
```go
list := htmlquery.Find(doc, "//a[@href]")
```
#### Find all A elements with `href` attribute and only return `href` value.
```go
list := htmlquery.Find(doc, "//a/@href")
for _ , n := range list{
fmt.Println(htmlquery.InnerText(n)) // output @href value
}
```
### Find the third A element.
```go
a := htmlquery.FindOne(doc, "//a[3]")
```
### Find children element (img) under A `href` and print the source
```go
a := htmlquery.FindOne(doc, "//a")
img := htmlquery.FindOne(a, "//img")
fmt.Println(htmlquery.SelectAttr(img, "src")) // output @src value
```
#### Evaluate the number of all IMG element.
```go
expr, _ := xpath.Compile("count(//img)")
v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
fmt.Printf("total count is %f", v)
```
# Quick Starts
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
if a != nil {
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
}
```
# FAQ
#### `Find()` vs `QueryAll()`, which is better?
`Find` and `QueryAll` both do the same thing: they search for all matching HTML nodes.
`Find` panics if you give it an invalid XPath query, while `QueryAll` returns an error.
#### Can I save my query expression object for the next query?
Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` functions, which accept your query expression object.
Caching (or reusing) a query expression object avoids recompiling the XPath query expression and improves query performance.
#### XPath query object cache performance
```
goos: windows
goarch: amd64
pkg: github.com/antchfx/htmlquery
BenchmarkSelectorCache-4 20000000 55.2 ns/op
BenchmarkDisableSelectorCache-4 500000 3162 ns/op
```
#### How to disable caching?
```
htmlquery.DisableSelectorCache = true
```
# Questions
Please let me know if you have any questions.
+42
View File
@@ -0,0 +1,42 @@
package htmlquery
import (
"sync"
"github.com/antchfx/xpath"
"github.com/golang/groupcache/lru"
)
// DisableSelectorCache disables caching of compiled query selectors when set to true.
var DisableSelectorCache = false
// SelectorCacheMaxEntries is the maximum number of selector objects that can be cached. Default is 50.
// Caching is disabled if SelectorCacheMaxEntries <= 0.
var SelectorCacheMaxEntries = 50
// Package-level state of the selector cache used by getQuery.
var (
// cacheOnce guards the one-time creation of cache.
cacheOnce sync.Once
// cache holds compiled expressions keyed by their source string.
cache *lru.Cache
// cacheMutex is held by getQuery around every lookup and insertion on cache.
cacheMutex sync.Mutex
)
// getQuery returns the compiled form of the XPath expression expr.
//
// When caching is switched off (DisableSelectorCache is true or
// SelectorCacheMaxEntries is not positive) the expression is compiled on
// every call. Otherwise the package-level LRU cache is created on first use,
// sized by SelectorCacheMaxEntries, and consulted while holding cacheMutex.
// A successfully compiled expression is stored before being returned;
// compilation errors are returned as-is and nothing is cached for them.
func getQuery(expr string) (*xpath.Expr, error) {
	cachingEnabled := !DisableSelectorCache && SelectorCacheMaxEntries > 0
	if !cachingEnabled {
		return xpath.Compile(expr)
	}

	cacheOnce.Do(func() { cache = lru.New(SelectorCacheMaxEntries) })

	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	cached, hit := cache.Get(expr)
	if hit {
		return cached.(*xpath.Expr), nil
	}

	compiled, err := xpath.Compile(expr)
	if err != nil {
		return nil, err
	}
	cache.Add(expr, compiled)
	return compiled, nil
}
+377
View File
@@ -0,0 +1,377 @@
/*
Package htmlquery provides functions for extracting data from HTML documents using XPath expressions.
*/
package htmlquery
import (
"bufio"
"compress/gzip"
"compress/zlib"
"fmt"
"io"
"net/http"
"os"
"strings"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)
// Compile-time assertion that *NodeNavigator implements xpath.NodeNavigator.
var _ xpath.NodeNavigator = &NodeNavigator{}
// CreateXPathNavigator returns a new NodeNavigator for the given html.Node.
// Both the root and the current position are set to top, and the navigator
// is not positioned on any attribute.
func CreateXPathNavigator(top *html.Node) *NodeNavigator {
	nav := new(NodeNavigator)
	nav.root = top
	nav.curr = top
	nav.attr = -1
	return nav
}
// Find behaves like QueryAll but panics if the expression `expr` cannot be
// parsed.
//
// See `QueryAll()` function.
func Find(top *html.Node, expr string) []*html.Node {
	matched, err := QueryAll(top, expr)
	if err == nil {
		return matched
	}
	panic(err)
}
// FindOne behaves like Query but panics if the expression `expr` cannot be
// parsed.
//
// See `Query()` function.
func FindOne(top *html.Node, expr string) *html.Node {
	first, err := Query(top, expr)
	if err == nil {
		return first
	}
	panic(err)
}
// QueryAll returns every html.Node under top that matches the XPath
// expression expr. It returns an error if `expr` cannot be parsed.
func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
	selector, err := getQuery(expr)
	if err != nil {
		return nil, err
	}
	return QuerySelectorAll(top, selector), nil
}
// Query evaluates the XPath expression expr against top and returns the
// first matching html.Node, or nil when nothing matches.
//
// It returns an error if the expression `expr` cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
	selector, err := getQuery(expr)
	if err != nil {
		return nil, err
	}
	first := QuerySelector(top, selector)
	return first, nil
}
// QuerySelector returns the first html.Node matched by the compiled XPath
// selector, or nil when the selector matches nothing.
func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
	iter := selector.Select(CreateXPathNavigator(top))
	if !iter.MoveNext() {
		return nil
	}
	nav := iter.Current().(*NodeNavigator)
	return getCurrentNode(nav)
}
// QuerySelectorAll returns all html.Nodes matched by the compiled XPath
// selector, in iteration order. The result is nil when nothing matches.
func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
	var matched []*html.Node
	for iter := selector.Select(CreateXPathNavigator(top)); iter.MoveNext(); {
		matched = append(matched, getCurrentNode(iter.Current().(*NodeNavigator)))
	}
	return matched
}
// LoadURL loads the HTML document from the specified URL using an HTTP GET
// issued through http.DefaultClient. The request advertises gzip support.
//
// Responses with a Content-Encoding of "gzip" or "deflate" are decompressed
// before parsing; any other non-empty encoding yields an error. The stream is
// then wrapped by charset.NewReader using the response's Content-Type header
// and handed to html.Parse.
//
// The response body (and any decompressor layered on top of it) is closed
// before LoadURL returns, on success and on every error path.
func LoadURL(url string) (*html.Node, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	// Enable gzip compression.
	req.Header.Add("Accept-Encoding", "gzip")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// Closing a gzip/zlib reader does not close the reader it wraps, so the
	// body needs its own deferred Close to avoid leaking the connection.
	defer resp.Body.Close()

	var reader io.Reader
	switch encoding := resp.Header.Get("Content-Encoding"); encoding {
	case "gzip":
		gz, err := gzip.NewReader(resp.Body)
		if err != nil {
			return nil, err
		}
		defer gz.Close()
		reader = gz
	case "deflate":
		zr, err := zlib.NewReader(resp.Body)
		if err != nil {
			return nil, err
		}
		defer zr.Close()
		reader = zr
	case "":
		reader = resp.Body
	default:
		return nil, fmt.Errorf("%s compression is not support", encoding)
	}

	r, err := charset.NewReader(reader, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	return html.Parse(r)
}
// LoadDoc opens the file at path, parses its content as HTML through a
// buffered reader and returns the resulting document node. The file is
// closed before LoadDoc returns.
func LoadDoc(path string) (*html.Node, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	buffered := bufio.NewReader(file)
	return html.Parse(buffered)
}
// getCurrentNode converts the navigator's current position into an
// *html.Node.
//
// When the navigator is on an attribute, a detached element node is
// synthesized: its Data is the attribute name and its only child is a text
// node holding the attribute value. Otherwise the navigator's current node is
// returned unchanged.
func getCurrentNode(n *NodeNavigator) *html.Node {
	if n.NodeType() != xpath.AttributeNode {
		return n.curr
	}
	text := &html.Node{Type: html.TextNode, Data: n.Value()}
	wrapper := &html.Node{Type: html.ElementNode, Data: n.LocalName()}
	wrapper.FirstChild = text
	wrapper.LastChild = text
	return wrapper
}
// Parse reads HTML from r and returns the root of the resulting parse tree.
// It delegates directly to html.Parse.
func Parse(r io.Reader) (*html.Node, error) {
	doc, err := html.Parse(r)
	return doc, err
}
// InnerText returns the concatenated Data of all text nodes at or below n, in
// document order. Comment nodes contribute nothing.
func InnerText(n *html.Node) string {
	var sb strings.Builder
	var collect func(node *html.Node)
	collect = func(node *html.Node) {
		if node.Type == html.TextNode {
			sb.WriteString(node.Data)
			return
		}
		if node.Type == html.CommentNode {
			return
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			collect(c)
		}
	}
	collect(n)
	return sb.String()
}
// SelectAttr returns the value of the attribute called name on n, or the
// empty string when n is nil or has no such attribute.
//
// A parentless element node whose Data equals name is treated as a
// synthesized attribute node (the shape getCurrentNode builds for attribute
// matches); its inner text is returned as the value.
func SelectAttr(n *html.Node, name string) (val string) {
	if n == nil {
		return ""
	}
	isSyntheticAttr := n.Type == html.ElementNode && n.Parent == nil && name == n.Data
	if isSyntheticAttr {
		return InnerText(n)
	}
	for i := range n.Attr {
		if n.Attr[i].Key == name {
			return n.Attr[i].Val
		}
	}
	return ""
}
// ExistsAttr reports whether n carries an attribute with the given name.
// It returns false for a nil node.
func ExistsAttr(n *html.Node, name string) bool {
	if n == nil {
		return false
	}
	found := false
	for i := 0; i < len(n.Attr) && !found; i++ {
		found = n.Attr[i].Key == name
	}
	return found
}
// OutputHTML renders n as HTML text. When self is true the node itself is
// rendered; otherwise only its children are rendered, one after another.
func OutputHTML(n *html.Node, self bool) string {
	var sb strings.Builder
	if !self {
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			html.Render(&sb, child)
		}
		return sb.String()
	}
	html.Render(&sb, n)
	return sb.String()
}
// NodeNavigator is a cursor over an html.Node tree. It tracks a current node
// and, optionally, one of that node's attributes.
type NodeNavigator struct {
// root is the node the navigator was created on; curr is the current position.
root, curr *html.Node
// attr is the index into curr.Attr of the current attribute, or -1 when the
// navigator is positioned on the node itself.
attr int
}
// Current returns the html.Node the navigator is currently positioned on.
func (h *NodeNavigator) Current() *html.Node {
	node := h.curr
	return node
}
// NodeType maps the current html node type to the corresponding
// xpath.NodeType. An element reports xpath.AttributeNode while the navigator
// is positioned on one of its attributes. It panics on an html node type it
// does not know.
func (h *NodeNavigator) NodeType() xpath.NodeType {
	switch kind := h.curr.Type; kind {
	case html.DocumentNode, html.DoctypeNode:
		// A <!DOCTYPE HTML> declaration is ignored and reported as a root node.
		return xpath.RootNode
	case html.ElementNode:
		if h.attr == -1 {
			return xpath.ElementNode
		}
		return xpath.AttributeNode
	case html.TextNode:
		return xpath.TextNode
	case html.CommentNode:
		return xpath.CommentNode
	default:
		panic(fmt.Sprintf("unknown HTML node type: %v", kind))
	}
}
// LocalName returns the key of the current attribute when the navigator is
// positioned on one, and the current node's Data otherwise.
func (h *NodeNavigator) LocalName() string {
	if h.attr == -1 {
		return h.curr.Data
	}
	return h.curr.Attr[h.attr].Key
}
// Prefix always returns the empty string.
func (*NodeNavigator) Prefix() string {
	const noPrefix = ""
	return noPrefix
}
// Value returns the string value at the current position: the Data of a
// comment or text node, the value of the current attribute, or the inner text
// of an element. Every other node type yields the empty string.
func (h *NodeNavigator) Value() string {
	node := h.curr
	if node.Type == html.CommentNode || node.Type == html.TextNode {
		return node.Data
	}
	if node.Type != html.ElementNode {
		return ""
	}
	if h.attr == -1 {
		return InnerText(node)
	}
	return node.Attr[h.attr].Val
}
// Copy returns a new navigator with the same root, current node and attribute
// position as h. Moving the copy does not move h.
func (h *NodeNavigator) Copy() xpath.NodeNavigator {
	return &NodeNavigator{root: h.root, curr: h.curr, attr: h.attr}
}
// MoveToRoot repositions the navigator on its root node.
//
// The attribute index is cleared as well. Without that, a navigator that was
// positioned on an attribute would keep a stale index after the move: the root
// would still report itself as an attribute, MoveToChild would refuse to
// descend, and LocalName/Value could index past the root's attribute list.
func (h *NodeNavigator) MoveToRoot() {
	h.curr = h.root
	h.attr = -1
}
// MoveToParent moves one step up. From an attribute, the "parent" is the
// owning element, so only the attribute index is cleared. From a node, the
// navigator moves to the node's Parent. It reports false when there is no
// parent to move to.
func (h *NodeNavigator) MoveToParent() bool {
	if h.attr != -1 {
		h.attr = -1
		return true
	}
	parent := h.curr.Parent
	if parent == nil {
		return false
	}
	h.curr = parent
	return true
}
// MoveToNextAttribute advances to the next attribute of the current node.
// Starting from the node itself (index -1) this selects the first attribute.
// It reports false when no further attribute exists.
func (h *NodeNavigator) MoveToNextAttribute() bool {
	next := h.attr + 1
	if next >= len(h.curr.Attr) {
		return false
	}
	h.attr = next
	return true
}
// MoveToChild moves to the first child of the current node. It reports false
// when positioned on an attribute or when the node has no children.
func (h *NodeNavigator) MoveToChild() bool {
	if h.attr != -1 || h.curr.FirstChild == nil {
		return false
	}
	h.curr = h.curr.FirstChild
	return true
}
// MoveToFirst moves to the first sibling of the current node. It reports
// false when positioned on an attribute or when the current node already is
// the first sibling.
func (h *NodeNavigator) MoveToFirst() bool {
	if h.attr != -1 || h.curr.PrevSibling == nil {
		return false
	}
	for prev := h.curr.PrevSibling; prev != nil; prev = prev.PrevSibling {
		h.curr = prev
	}
	return true
}
// String returns the same text as Value.
func (h *NodeNavigator) String() string {
	text := h.Value()
	return text
}
// MoveToNext moves to the next sibling of the current node. It reports false
// when positioned on an attribute or when there is no next sibling.
func (h *NodeNavigator) MoveToNext() bool {
	if h.attr != -1 || h.curr.NextSibling == nil {
		return false
	}
	h.curr = h.curr.NextSibling
	return true
}
// MoveToPrevious moves to the previous sibling of the current node. It
// reports false when positioned on an attribute or when there is no previous
// sibling.
func (h *NodeNavigator) MoveToPrevious() bool {
	if h.attr != -1 || h.curr.PrevSibling == nil {
		return false
	}
	h.curr = h.curr.PrevSibling
	return true
}
// MoveTo repositions h at the same node and attribute as other. It reports
// false, leaving h untouched, when other is not a *NodeNavigator or was
// created on a different root.
func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
	target, isNav := other.(*NodeNavigator)
	if isNav && target.root == h.root {
		h.curr, h.attr = target.curr, target.attr
		return true
	}
	return false
}
+32
View File
@@ -0,0 +1,32 @@
# vscode
.vscode
debug
*.test
./build
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
+17
View File
@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+302
View File
@@ -0,0 +1,302 @@
# xmlquery
[![Build Status](https://github.com/antchfx/xmlquery/actions/workflows/testing.yml/badge.svg)](https://github.com/antchfx/xmlquery/actions/workflows/testing.yml)
[![GoDoc](https://godoc.org/github.com/antchfx/xmlquery?status.svg)](https://godoc.org/github.com/antchfx/xmlquery)
[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/xmlquery)](https://goreportcard.com/report/github.com/antchfx/xmlquery)
# Overview
`xmlquery` is an XPath query package for XML documents, allowing you to extract
data or evaluate from XML documents with an XPath expression.
`xmlquery` has a built-in query object caching feature that caches recently used
XPATH query strings. Enabling caching can avoid recompile XPath expression for
each query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
[htmlquery](https://github.com/antchfx/htmlquery) - Package for the HTML document query.
[xmlquery](https://github.com/antchfx/xmlquery) - Package for the XML document query.
[jsonquery](https://github.com/antchfx/jsonquery) - Package for the JSON document query.
# Installation
```
$ go get github.com/antchfx/xmlquery
```
# Quick Starts
```go
import (
"github.com/antchfx/xmlquery"
)
func main(){
s := `<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>W3Schools Home Page</title>
<link>https://www.w3schools.com</link>
<description>Free web building tutorials</description>
<item>
<title>RSS Tutorial</title>
<link>https://www.w3schools.com/xml/xml_rss.asp</link>
<description>New RSS tutorial on W3Schools</description>
</item>
<item>
<title>XML Tutorial</title>
<link>https://www.w3schools.com/xml</link>
<description>New XML tutorial on W3Schools</description>
</item>
</channel>
</rss>`
doc, err := xmlquery.Parse(strings.NewReader(s))
if err != nil {
panic(err)
}
channel := xmlquery.FindOne(doc, "//channel")
if n := channel.SelectElement("title"); n != nil {
fmt.Printf("title: %s\n", n.InnerText())
}
if n := channel.SelectElement("link"); n != nil {
fmt.Printf("link: %s\n", n.InnerText())
}
for i, n := range xmlquery.Find(doc, "//item/title") {
fmt.Printf("#%d %s\n", i, n.InnerText())
}
}
```
# Getting Started
### Find specified XPath query.
```go
list, err := xmlquery.QueryAll(doc, "a")
if err != nil {
panic(err)
}
```
#### Parse an XML from URL.
```go
doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml")
```
#### Parse an XML from string.
```go
s := `<?xml version="1.0" encoding="utf-8"?><rss version="2.0"></rss>`
doc, err := xmlquery.Parse(strings.NewReader(s))
```
#### Parse an XML from io.Reader.
```go
f, err := os.Open("../books.xml")
doc, err := xmlquery.Parse(f)
```
#### Parse an XML in a stream fashion (simple case without elements filtering).
```go
f, _ := os.Open("../books.xml")
p, err := xmlquery.CreateStreamParser(f, "/bookstore/book")
for {
n, err := p.Read()
if err == io.EOF {
break
}
if err != nil {
panic(err)
}
fmt.Println(n)
}
```
Note: `CreateStreamParser()` is used to save memory when you have a large XML file to parse.
#### Parse an XML in a stream fashion (advanced case with element filtering).
```go
f, _ := os.Open("../books.xml")
p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]")
for {
n, err := p.Read()
if err == io.EOF {
break
}
if err != nil {
panic(err)
}
fmt.Println(n)
}
```
#### Find authors of all books in the bookstore.
```go
list := xmlquery.Find(doc, "//book//author")
// or
list := xmlquery.Find(doc, "//author")
```
#### Find the second book.
```go
book := xmlquery.FindOne(doc, "//book[2]")
```
#### Find the last book.
```go
book := xmlquery.FindOne(doc, "//book[last()]")
```
#### Find all book elements and only get `id` attribute.
```go
list := xmlquery.Find(doc,"//book/@id")
fmt.Println(list[0].InnerText()) // output @id value
```
#### Find all books with id `bk104`.
```go
list := xmlquery.Find(doc, "//book[@id='bk104']")
```
#### Find all books with price less than 5.
```go
list := xmlquery.Find(doc, "//book[price<5]")
```
#### Evaluate total price of all books.
```go
expr, err := xpath.Compile("sum(//book/price)")
price := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
fmt.Printf("total price: %f\n", price)
```
#### Count the number of books.
```go
expr, err := xpath.Compile("count(//book)")
count := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
```
#### Calculate the total price of all book prices.
```go
expr, err := xpath.Compile("sum(//book/price)")
price := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)).(float64)
```
# Advanced Features
### Parse `UTF-16` XML file with `ParseWithOptions()`.
```go
f, _ := os.Open(`UTF-16.XML`)
// Convert UTF-16 XML to UTF-8
utf16ToUtf8Transformer := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
utf8Reader := transform.NewReader(f, utf16ToUtf8Transformer)
// Sets `CharsetReader`
options := xmlquery.ParserOptions{
Decoder: &xmlquery.DecoderOptions{
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
return input, nil
},
},
}
doc, err := xmlquery.ParseWithOptions(utf8Reader, options)
```
### Query with custom namespace prefix.
```go
s := `<?xml version="1.0" encoding="UTF-8"?>
<pd:ProcessDefinition xmlns:pd="http://xmlns.xyz.com/process/2003" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<pd:activity name="Invoke Request-Response Service">
<pd:type>RequestReplyActivity</pd:type>
<pd:resourceType>OpClientReqActivity</pd:resourceType>
<pd:x>300</pd:x>
<pd:y>80</pd:y>
</pd:activity>
</pd:ProcessDefinition>`
nsMap := map[string]string{
"q": "http://xmlns.xyz.com/process/2003",
"r": "http://www.w3.org/1999/XSL/Transform",
"s": "http://www.w3.org/2001/XMLSchema",
}
expr, _ := xpath.CompileWithNS("//q:activity", nsMap)
node := xmlquery.QuerySelector(doc, expr)
```
#### Create XML document without call `xml.Marshal`.
```go
doc := &xmlquery.Node{
Type: xmlquery.DeclarationNode,
Data: "xml",
Attr: []xml.Attr{
xml.Attr{Name: xml.Name{Local: "version"}, Value: "1.0"},
},
}
root := &xmlquery.Node{
Data: "rss",
Type: xmlquery.ElementNode,
}
doc.FirstChild = root
channel := &xmlquery.Node{
Data: "channel",
Type: xmlquery.ElementNode,
}
root.FirstChild = channel
title := &xmlquery.Node{
Data: "title",
Type: xmlquery.ElementNode,
}
title_text := &xmlquery.Node{
Data: "W3Schools Home Page",
Type: xmlquery.TextNode,
}
title.FirstChild = title_text
channel.FirstChild = title
fmt.Println(doc.OutputXML(true))
fmt.Println(doc.OutputXMLWithOptions(WithOutputSelf()))
```
Output:
```xml
<?xml version="1.0"?><rss><channel><title>W3Schools Home Page</title></channel></rss>
```
# FAQ
#### `Find()` vs `QueryAll()`, which is better?
`Find` and `QueryAll` both do the same thing: searches all of matched XML nodes.
`Find` panics if provided with an invalid XPath query, while `QueryAll` returns
an error.
#### Can I save my query expression object for the next query?
Yes, you can. We provide `QuerySelector` and `QuerySelectorAll` methods; they
accept your query expression object.
Caching a query expression object avoids recompiling the XPath query
expression, improving query performance.
# Questions
Please let me know if you have any questions
+43
View File
@@ -0,0 +1,43 @@
package xmlquery
import (
"sync"
"github.com/golang/groupcache/lru"
"github.com/antchfx/xpath"
)
// DisableSelectorCache disables caching of compiled query selectors when set to true.
var DisableSelectorCache = false
// SelectorCacheMaxEntries is the maximum number of selector objects that can be cached. Default is 50.
// Caching is disabled if SelectorCacheMaxEntries <= 0.
var SelectorCacheMaxEntries = 50
// Package-level state of the selector cache used by getQuery.
var (
// cacheOnce guards the one-time creation of cache.
cacheOnce sync.Once
// cache holds compiled expressions keyed by their source string.
cache *lru.Cache
// cacheMutex is held by getQuery around every lookup and insertion on cache.
cacheMutex sync.Mutex
)
// getQuery returns the compiled form of the XPath expression expr.
//
// When caching is switched off (DisableSelectorCache is true or
// SelectorCacheMaxEntries is not positive) the expression is compiled on
// every call. Otherwise the package-level LRU cache is created on first use,
// sized by SelectorCacheMaxEntries, and consulted while holding cacheMutex.
// A successfully compiled expression is stored before being returned;
// compilation errors are returned as-is and nothing is cached for them.
func getQuery(expr string) (*xpath.Expr, error) {
	cachingEnabled := !DisableSelectorCache && SelectorCacheMaxEntries > 0
	if !cachingEnabled {
		return xpath.Compile(expr)
	}

	cacheOnce.Do(func() { cache = lru.New(SelectorCacheMaxEntries) })

	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	cached, hit := cache.Get(expr)
	if hit {
		return cached.(*xpath.Expr), nil
	}

	compiled, err := xpath.Compile(expr)
	if err != nil {
		return nil, err
	}
	cache.Add(expr, compiled)
	return compiled, nil
}
+79
View File
@@ -0,0 +1,79 @@
package xmlquery
import (
"bufio"
)
// cachedReader wraps a *bufio.Reader and, while caching is switched on,
// records the bytes it hands out into a fixed-capacity buffer.
type cachedReader struct {
	buffer  *bufio.Reader // underlying source of bytes
	cache   []byte        // recorded bytes; never grows past its initial capacity
	caching bool          // whether reads are currently being recorded
}

// newCachedReader wraps r with caching switched off and an empty cache that
// can hold up to 4096 bytes.
func newCachedReader(r *bufio.Reader) *cachedReader {
	c := new(cachedReader)
	c.buffer = r
	c.cache = make([]byte, 0, 4096)
	return c
}

// StartCaching discards anything recorded so far and begins recording reads.
func (c *cachedReader) StartCaching() {
	c.caching = true
	c.cache = c.cache[:0]
}

// ReadByte reads one byte from the underlying reader and records it when
// caching is on. A byte that no longer fits into the cache is silently not
// recorded.
func (c *cachedReader) ReadByte() (b byte, err error) {
	if b, err = c.buffer.ReadByte(); err == nil && c.caching {
		c.cacheByte(b)
	}
	return b, err
}

// Cache returns the bytes recorded so far. The slice aliases the internal
// buffer.
func (c *cachedReader) Cache() []byte {
	return c.cache
}

// CacheWithLimit returns at most the first n recorded bytes, or nil when n is
// less than 1. The slice aliases the internal buffer.
func (c *cachedReader) CacheWithLimit(n int) []byte {
	if n < 1 {
		return nil
	}
	if size := len(c.cache); size < n {
		n = size
	}
	return c.cache[:n]
}

// StopCaching stops recording; bytes recorded so far are kept.
func (c *cachedReader) StopCaching() {
	c.caching = false
}

// Read reads into p from the underlying reader. On a successful read with
// caching on, the bytes read are recorded until the cache is full.
func (c *cachedReader) Read(p []byte) (int, error) {
	n, err := c.buffer.Read(p)
	if err != nil || !c.caching {
		return n, err
	}
	for _, b := range p[:n] {
		if !c.cacheByte(b) {
			break
		}
	}
	return n, err
}

// cacheByte appends b to the cache and reports whether there was room for it.
func (c *cachedReader) cacheByte(b byte) bool {
	if len(c.cache) == cap(c.cache) {
		return false
	}
	// len < cap here, so append writes in place and never reallocates.
	c.cache = append(c.cache, b)
	return true
}
+477
View File
@@ -0,0 +1,477 @@
package xmlquery
import (
"bufio"
"encoding/xml"
"fmt"
"html"
"io"
"strings"
)
// A NodeType is the type of a Node.
type NodeType uint
const (
// DocumentNode is a document object that, as the root of the document tree,
// provides access to the entire XML document.
DocumentNode NodeType = iota
// DeclarationNode is a declaration that is serialized between "<?" and "?>"
// (for example, <?xml version="1.0"?> ).
DeclarationNode
// ElementNode is an element (for example, <item> ).
ElementNode
// TextNode is the text content of a node.
TextNode
// CharDataNode is a CDATA section (for example, <![CDATA[content]]> ).
CharDataNode
// CommentNode is a comment (for example, <!-- my comment --> ).
CommentNode
// AttributeNode is an attribute of an element.
AttributeNode
// NotationNode is a directive in the document, serialized between "<!" and
// ">" (for example, <!text...>).
NotationNode
)
// Attr is a single attribute of a Node.
type Attr struct {
// Name is the attribute name. AddAttr fills it from a "prefix:local" key,
// putting the prefix into Name.Space and the rest into Name.Local.
Name xml.Name
// Value is the attribute value.
Value string
// NamespaceURI is presumably the namespace URI the attribute's prefix
// resolves to — TODO confirm against the parser, which is not visible here.
NamespaceURI string
}
// A Node consists of a NodeType and some Data (tag name for
// element nodes, content for text) and are part of a tree of Nodes.
type Node struct {
// Links to the neighbouring nodes in the tree; nil where there is none.
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
// Type is the kind of node.
Type NodeType
// Data is the tag name for element nodes and the content for text nodes.
Data string
// Prefix, when non-empty, is written before Data as "Prefix:Data" when the
// node is serialized.
Prefix string
// NamespaceURI is presumably the namespace URI that Prefix resolves to —
// TODO confirm against the parser, which is not visible here.
NamespaceURI string
// Attr holds the node's attributes in order.
Attr []Attr
level int // node level in the tree
}
// outputConfiguration collects the settings that control XML serialization.
type outputConfiguration struct {
	printSelf              bool   // emit the node itself, not only its children
	preserveSpaces         bool   // keep text node data untrimmed
	emptyElementTagSupport bool   // write childless elements as <empty/>
	skipComments           bool   // leave comment nodes out of the output
	useIndentation         string // indentation unit; empty disables indentation
}

// OutputOption adjusts one setting of an output configuration.
type OutputOption func(*outputConfiguration)

// WithOutputSelf makes the output include the node itself rather than only
// its children.
func WithOutputSelf() OutputOption {
	enable := func(cfg *outputConfiguration) { cfg.printSelf = true }
	return enable
}

// WithEmptyTagSupport makes elements without children be written as <empty/>
// instead of <empty></empty>.
func WithEmptyTagSupport() OutputOption {
	enable := func(cfg *outputConfiguration) { cfg.emptyElementTagSupport = true }
	return enable
}

// WithoutComments omits comments from the output.
func WithoutComments() OutputOption {
	enable := func(cfg *outputConfiguration) { cfg.skipComments = true }
	return enable
}

// WithPreserveSpace keeps whitespace in the output.
func WithPreserveSpace() OutputOption {
	enable := func(cfg *outputConfiguration) { cfg.preserveSpaces = true }
	return enable
}

// WithoutPreserveSpace stops whitespace from being preserved in the output.
func WithoutPreserveSpace() OutputOption {
	disable := func(cfg *outputConfiguration) { cfg.preserveSpaces = false }
	return disable
}

// WithIndentation sets the string used for one level of indentation when
// formatting the output.
func WithIndentation(indentation string) OutputOption {
	set := func(cfg *outputConfiguration) { cfg.useIndentation = indentation }
	return set
}
// newXMLName splits a "prefix:local" name at its first colon into an
// xml.Name with Space set to the prefix and Local to the remainder. A name
// with no colon, or with a colon as its first character, is stored unchanged
// in Local.
func newXMLName(name string) xml.Name {
	idx := strings.IndexByte(name, ':')
	if idx <= 0 {
		return xml.Name{Local: name}
	}
	prefix, local := name[:idx], name[idx+1:]
	return xml.Name{Space: prefix, Local: local}
}
// Level returns the node's level in the tree.
func (n *Node) Level() int {
	depth := n.level
	return depth
}
// InnerText returns the concatenated Data of all text and CDATA nodes at or
// below n, in document order. Comment nodes contribute nothing.
func (n *Node) InnerText() string {
	var sb strings.Builder
	var walk func(cur *Node)
	walk = func(cur *Node) {
		if cur.Type == CommentNode {
			return
		}
		if cur.Type == TextNode || cur.Type == CharDataNode {
			sb.WriteString(cur.Data)
			return
		}
		for c := cur.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(n)
	return sb.String()
}
// sanitizedData returns the node's Data, with leading and trailing whitespace
// trimmed unless preserveSpaces is set.
func (n *Node) sanitizedData(preserveSpaces bool) string {
	if !preserveSpaces {
		return strings.TrimSpace(n.Data)
	}
	return n.Data
}
// calculatePreserveSpaces resolves the whitespace-preservation setting for n.
// An xml:space attribute of "preserve" turns it on and "default" turns it
// off; any other value leaves pastValue in effect.
func calculatePreserveSpaces(n *Node, pastValue bool) bool {
	switch n.SelectAttr("xml:space") {
	case "preserve":
		return true
	case "default":
		return false
	default:
		return pastValue
	}
}
// indentation tracks nesting depth while XML is written and emits the line
// breaks and indent strings that pretty-print it. All exported methods accept
// a nil receiver, which turns them into no-ops.
type indentation struct {
	level    int       // current nesting depth
	hasChild bool      // set once an element has been closed at the current depth
	indent   string    // text written once per nesting level
	w        io.Writer // destination for line breaks and indents
}

// newIndentation returns an indentation that writes to w, or nil when indent
// is empty (indentation disabled).
func newIndentation(indent string, w io.Writer) *indentation {
	if len(indent) == 0 {
		return nil
	}
	ind := new(indentation)
	ind.indent, ind.w = indent, w
	return ind
}

// NewLine writes a single line break.
func (i *indentation) NewLine() (err error) {
	if i != nil {
		_, err = io.WriteString(i.w, "\n")
	}
	return err
}

// Open writes a line break plus the indent for the current depth, then
// descends one level.
func (i *indentation) Open() (err error) {
	if i == nil {
		return nil
	}
	if err = i.writeIndent(); err != nil {
		return err
	}
	i.hasChild = false
	i.level++
	return nil
}

// Close ascends one level. A line break and indent are written first only if
// an element was closed since the matching Open, so that an element without
// child elements keeps its closing tag on the same line.
func (i *indentation) Close() (err error) {
	if i == nil {
		return nil
	}
	i.level--
	if i.hasChild {
		if err = i.writeIndent(); err != nil {
			return err
		}
	}
	i.hasChild = true
	return nil
}

// writeIndent writes a line break followed by the indent string repeated
// once per level.
func (i *indentation) writeIndent() (err error) {
	if _, err = io.WriteString(i.w, "\n"); err != nil {
		return err
	}
	_, err = io.WriteString(i.w, strings.Repeat(i.indent, i.level))
	return err
}
// outputXML serializes n and, recursively, its children to w.
//
// preserveSpaces is the setting inherited from the parent; it is re-evaluated
// against n's own xml:space attribute before use and the result is passed on
// to the children. config supplies the comment and empty-tag settings. indent
// may be nil, in which case no line breaks or indents are written (its
// methods are no-ops on a nil receiver).
//
// The first write error stops serialization and is returned.
func outputXML(w io.Writer, n *Node, preserveSpaces bool, config *outputConfiguration, indent *indentation) (err error) {
preserveSpaces = calculatePreserveSpaces(n, preserveSpaces)
// Leaf node kinds are written in full and return here. Declarations and
// elements (the default case) only get their opening "<?name" / "<name"
// written and fall through to the shared attribute and child handling.
switch n.Type {
case TextNode:
_, err = io.WriteString(w, html.EscapeString(n.sanitizedData(preserveSpaces)))
return
case CharDataNode:
_, err = fmt.Fprintf(w, "<![CDATA[%v]]>", n.Data)
return
case CommentNode:
if !config.skipComments {
_, err = fmt.Fprintf(w, "<!--%v-->", n.Data)
}
return
case NotationNode:
if err = indent.NewLine(); err != nil {
return
}
_, err = fmt.Fprintf(w, "<!%s>", n.Data)
return
case DeclarationNode:
_, err = io.WriteString(w, "<?"+n.Data)
if err != nil {
return
}
default:
if err = indent.Open(); err != nil {
return
}
if n.Prefix == "" {
_, err = io.WriteString(w, "<"+n.Data)
} else {
_, err = fmt.Fprintf(w, "<%s:%s", n.Prefix, n.Data)
}
if err != nil {
return
}
}
// Attributes: ` name="value"` or ` space:name="value"`, with the value escaped.
for _, attr := range n.Attr {
if attr.Name.Space != "" {
_, err = fmt.Fprintf(w, ` %s:%s=`, attr.Name.Space, attr.Name.Local)
} else {
_, err = fmt.Fprintf(w, ` %s=`, attr.Name.Local)
}
if err != nil {
return
}
_, err = fmt.Fprintf(w, `"%v"`, html.EscapeString(attr.Value))
if err != nil {
return
}
}
// Finish the opening construct: "?>" for a declaration, ">" for an element,
// or "/>" for a childless element when empty-tag support is enabled (in
// which case the element is complete and the function returns).
if n.Type == DeclarationNode {
_, err = io.WriteString(w, "?>")
} else {
if n.FirstChild != nil || !config.emptyElementTagSupport {
_, err = io.WriteString(w, ">")
} else {
_, err = io.WriteString(w, "/>")
if err != nil {
return
}
err = indent.Close()
return
}
}
if err != nil {
return
}
// Children are written for declarations as well as for elements.
for child := n.FirstChild; child != nil; child = child.NextSibling {
err = outputXML(w, child, preserveSpaces, config, indent)
if err != nil {
return
}
}
// Only elements get a closing tag; a declaration has none.
if n.Type != DeclarationNode {
if err = indent.Close(); err != nil {
return
}
if n.Prefix == "" {
_, err = fmt.Fprintf(w, "</%s>", n.Data)
} else {
_, err = fmt.Fprintf(w, "</%s:%s>", n.Prefix, n.Data)
}
}
return
}
// OutputXML returns the node serialized as XML text, including tag names.
// When self is true the node itself is included; otherwise only its children
// are.
func (n *Node) OutputXML(self bool) string {
	var opts []OutputOption
	if self {
		opts = append(opts, WithOutputSelf())
	}
	return n.OutputXMLWithOptions(opts...)
}
// OutputXMLWithOptions returns the node serialized as XML text, including tag
// names, using the given output options.
func (n *Node) OutputXMLWithOptions(opts ...OutputOption) string {
	sb := new(strings.Builder)
	n.WriteWithOptions(sb, opts...)
	return sb.String()
}
// Write serializes the node as XML to writer. When self is true the node
// itself is included; otherwise only its children are written.
func (n *Node) Write(writer io.Writer, self bool) error {
	var opts []OutputOption
	if self {
		opts = append(opts, WithOutputSelf())
	}
	return n.WriteWithOptions(writer, opts...)
}
// WriteWithOptions serializes the node as XML to writer using the given
// options.
//
// Whitespace is preserved by default; options are applied on top of that
// default. The node itself is written only when WithOutputSelf is given and
// the node is not a DocumentNode; otherwise its children are written in
// order.
//
// Output goes through a bufio.Writer. The error returned is the first
// serialization error or, if there was none, the error from the final flush,
// so a failing writer is reported even when all output fit into the buffer.
func (n *Node) WriteWithOptions(writer io.Writer, opts ...OutputOption) (err error) {
	config := &outputConfiguration{
		preserveSpaces: true,
	}
	// Set the options
	for _, opt := range opts {
		opt(config)
	}
	preserveSpaces := calculatePreserveSpaces(n, config.preserveSpaces)

	b := bufio.NewWriter(writer)
	defer func() {
		// Always flush, but never let a flush error mask an earlier one.
		if flushErr := b.Flush(); err == nil {
			err = flushErr
		}
	}()

	ident := newIndentation(config.useIndentation, b)
	if config.printSelf && n.Type != DocumentNode {
		return outputXML(b, n, preserveSpaces, config, ident)
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if err = outputXML(b, child, preserveSpaces, config, ident); err != nil {
			return err
		}
	}
	return nil
}
// AddAttr appends an attribute named key with value val to node n. No check
// for an existing attribute of the same name is made.
func AddAttr(n *Node, key, val string) {
	n.Attr = append(n.Attr, Attr{Name: newXMLName(key), Value: val})
}
// SetAttr sets the value of the attribute named key. The first attribute with
// a matching name is updated in place; if there is none, a new attribute is
// appended.
func (n *Node) SetAttr(key, value string) {
	want := newXMLName(key)
	for i := range n.Attr {
		if n.Attr[i].Name != want {
			continue
		}
		n.Attr[i].Value = value
		return
	}
	AddAttr(n, key, value)
}
// RemoveAttr deletes the first attribute whose name matches key. It does
// nothing when no such attribute exists.
func (n *Node) RemoveAttr(key string) {
	want := newXMLName(key)
	for i := range n.Attr {
		if n.Attr[i].Name != want {
			continue
		}
		n.Attr = append(n.Attr[:i], n.Attr[i+1:]...)
		return
	}
}
// AddChild appends node n to parent's list of children, making it the last
// child and updating the sibling links accordingly.
func AddChild(parent, n *Node) {
	n.Parent = parent
	n.NextSibling = nil
	if parent.FirstChild != nil {
		// Non-empty child list: hook n behind the current last child.
		parent.LastChild.NextSibling = n
		n.PrevSibling = parent.LastChild
	} else {
		// First child of parent.
		parent.FirstChild = n
		n.PrevSibling = nil
	}
	parent.LastChild = n
}
// AddSibling appends node n at the end of the sibling chain that 'sibling'
// belongs to. If the chain has a parent, n becomes that parent's last child.
func AddSibling(sibling, n *Node) {
	// Walk to the current end of the chain.
	for sibling.NextSibling != nil {
		sibling = sibling.NextSibling
	}
	n.Parent = sibling.Parent
	sibling.NextSibling = n
	n.PrevSibling = sibling
	n.NextSibling = nil
	if parent := sibling.Parent; parent != nil {
		parent.LastChild = n
	}
}
// AddImmediateSibling inserts node n directly after 'sibling' in its sibling
// chain. When 'sibling' was the last node and has a parent, n becomes the
// parent's last child.
func AddImmediateSibling(sibling, n *Node) {
	n.Parent = sibling.Parent
	n.NextSibling = sibling.NextSibling
	sibling.NextSibling = n
	n.PrevSibling = sibling
	switch {
	case n.NextSibling != nil:
		// Inserted in the middle: fix the back link of the following node.
		n.NextSibling.PrevSibling = n
	case n.Parent != nil:
		// Inserted at the end: n is the new last child.
		n.Parent.LastChild = n
	}
}
// RemoveFromTree detaches node n, together with its subtree, from the tree it
// belongs to. A node without a parent (a root) is left untouched.
func RemoveFromTree(n *Node) {
	parent := n.Parent
	if parent == nil {
		return
	}
	isFirst := parent.FirstChild == n
	isLast := parent.LastChild == n
	switch {
	case isFirst && isLast:
		// Only child.
		parent.FirstChild = nil
		parent.LastChild = nil
	case isFirst:
		parent.FirstChild = n.NextSibling
		n.NextSibling.PrevSibling = nil
	case isLast:
		parent.LastChild = n.PrevSibling
		n.PrevSibling.NextSibling = nil
	default:
		// Somewhere in the middle: splice the neighbours together.
		n.PrevSibling.NextSibling = n.NextSibling
		n.NextSibling.PrevSibling = n.PrevSibling
	}
	n.Parent = nil
	n.PrevSibling = nil
	n.NextSibling = nil
}
// GetRoot follows the Parent links from n and returns the topmost node of
// the tree. It returns nil when n is nil.
func GetRoot(n *Node) *Node {
	if n == nil {
		return nil
	}
	top := n
	for parent := top.Parent; parent != nil; parent = top.Parent {
		top = parent
	}
	return top
}
+33
View File
@@ -0,0 +1,33 @@
package xmlquery
import (
"encoding/xml"
"io"
)
// ParserOptions groups the optional settings accepted by ParseWithOptions and
// CreateStreamParserWithOptions.
type ParserOptions struct {
	// Decoder, when non-nil, is applied to the parser's XML decoder.
	Decoder *DecoderOptions
}

// apply transfers the configured options onto parser. With a nil Decoder the
// parser is left unchanged.
func (options ParserOptions) apply(parser *parser) {
	if dec := options.Decoder; dec != nil {
		dec.apply(parser.decoder)
	}
}
// DecoderOptions exposes the same options as the Decoder of the standard
// encoding/xml package. Please refer to its documentation:
// https://golang.org/pkg/encoding/xml/#Decoder
//
// Strict, AutoClose and Entity are always copied onto the decoder, so their
// zero values (false, nil, nil) are applied when left unset. CharsetReader is
// only copied when it is non-nil.
type DecoderOptions struct {
	Strict        bool
	AutoClose     []string
	Entity        map[string]string
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}

// apply copies the options onto decoder.
//
// A nil CharsetReader keeps whatever charset reader the decoder already has
// instead of replacing it with nil, so setting an unrelated option does not
// silently remove an existing reader (the parser installs a default one).
func (options DecoderOptions) apply(decoder *xml.Decoder) {
	decoder.Strict = options.Strict
	decoder.AutoClose = options.AutoClose
	decoder.Entity = options.Entity
	if options.CharsetReader != nil {
		decoder.CharsetReader = options.CharsetReader
	}
}
+430
View File
@@ -0,0 +1,430 @@
package xmlquery
import (
"bufio"
"bytes"
"encoding/xml"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"sync"
"github.com/antchfx/xpath"
"golang.org/x/net/html/charset"
)
// xmlMIMERegex recognizes XML media types, case-insensitively: "text/xml" and
// "text/wbxml", as well as application/, image/, message/ and model/ types
// whose subtype is "xml"/"wbxml", optionally preceded by a name and "+"
// (for example "application/xml" or "image/svg+xml"). The pattern is not
// anchored, so it matches anywhere inside the tested string.
var xmlMIMERegex = regexp.MustCompile(`(?i)((application|image|message|model)/((\w|\.|-)+\+?)?|text/)(wb)?xml`)
// LoadURL fetches the document at url with an HTTP GET request and parses it.
// The response is only parsed when its Content-Type header looks like an XML
// media type; otherwise an error naming the received Content-Type is returned.
func LoadURL(url string) (*Node, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	contentType := resp.Header.Get("Content-Type")
	if !xmlMIMERegex.MatchString(contentType) {
		return nil, fmt.Errorf("invalid XML document(%s)", contentType)
	}
	return Parse(resp.Body)
}
// Parse reads an XML document from r and returns its parse tree, using the
// zero-value ParserOptions.
func Parse(r io.Reader) (*Node, error) {
	var defaults ParserOptions
	return ParseWithOptions(r, defaults)
}
// ParseWithOptions is like Parse, but applies the given options to the parser
// first. The whole input is consumed; any error other than io.EOF is returned
// as is. A document that contains no element directly below the document
// node is rejected as invalid.
func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
	p := createParser(r)
	options.apply(p)

	var err error
	for err == nil {
		_, err = p.parse()
	}
	if err != io.EOF {
		return nil, err
	}

	// Additional validity check: according to https://www.w3.org/TR/xml
	// the document MUST contain at least ONE element.
	hasElement := false
scan:
	for top := p.doc; top != nil; top = top.NextSibling {
		for child := top.FirstChild; child != nil; child = child.NextSibling {
			if child.Type == ElementNode {
				hasElement = true
				break scan
			}
		}
	}
	if !hasElement {
		return nil, fmt.Errorf("xmlquery: invalid XML document")
	}
	return p.doc, nil
}
// parser holds the state needed to turn the token stream of an xml.Decoder
// into a tree of Nodes, both for whole-document and for streaming parsing.
type parser struct {
// decoder produces the XML tokens consumed by parse.
decoder *xml.Decoder
// doc is the document node at the root of the tree being built.
doc *Node
// level is the nesting depth assigned to the next node; it is incremented
// after a start element and decremented on an end element.
level int
// prev is the node most recently recorded as attachment point; new nodes
// are linked relative to it.
prev *Node
streamElementXPath *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s).
streamElementFilter *xpath.Expr // If specified, it provides further filtering on the target element.
streamNode *Node // Need to remember the last target node So we can clean it up upon next Read() call.
streamNodePrev *Node // Need to remember target node's prev so upon target node removal, we can restore correct prev.
reader *cachedReader // Need to maintain a reference to the reader, so we can determine whether a node contains CDATA.
// once guards the one-time initialization of space2prefix in parse.
once sync.Once
// space2prefix maps a namespace URI to the prefix declared for it.
space2prefix map[string]*xmlnsPrefix
}
// xmlnsPrefix records a namespace prefix declaration seen by the parser.
type xmlnsPrefix struct {
// name is the declared prefix; it is empty for a default namespace.
name string
// level is the parser nesting level at which the declaration was seen.
level int
}
// createParser builds a parser that reads XML from r. The input is wrapped in
// a buffered, caching reader, the tree starts with an empty document node,
// and the decoder falls back to charset.NewReaderLabel for non-UTF-8 input
// when it has no charset reader of its own.
func createParser(r io.Reader) *parser {
	cached := newCachedReader(bufio.NewReader(r))
	dec := xml.NewDecoder(cached)
	if dec.CharsetReader == nil {
		dec.CharsetReader = charset.NewReaderLabel
	}
	root := &Node{Type: DocumentNode}
	return &parser{
		decoder: dec,
		doc:     root,
		prev:    root,
		reader:  cached,
	}
}
// parse reads tokens from p.decoder and links a node for each of them into
// the tree rooted at p.doc.
//
// It keeps reading until the decoder reports an error (io.EOF included), which
// is returned together with a nil node. In streaming mode
// (p.streamElementXPath != nil) it additionally returns early: as soon as the
// end tag of a target element has been read and the optional
// p.streamElementFilter accepts it, that element is returned with a nil error.
func (p *parser) parse() (*Node, error) {
// Seed the namespace table once with the predefined "xml" prefix.
p.once.Do(func() {
p.space2prefix = map[string]*xmlnsPrefix{"http://www.w3.org/XML/1998/namespace": {name: "xml", level: 0}}
})
// Open-element depth inside the current streaming target. It is a local, so
// it starts from zero on every call.
var streamElementNodeCounter int
for {
// Caching is switched on only while the token is decoded; the cached bytes
// are inspected below to detect CDATA sections and explicit element
// prefixes (presumably they are the raw bytes of this token).
p.reader.StartCaching()
tok, err := p.decoder.Token()
p.reader.StopCaching()
if err != nil {
return nil, err
}
switch tok := tok.(type) {
case xml.StartElement:
if p.level == 0 {
// Missing XML declaration: synthesize <?xml version="1.0"?> so that
// elements always hang below a declaration node.
attributes := make([]Attr, 1)
attributes[0].Name = xml.Name{Local: "version"}
attributes[0].Value = "1.0"
node := &Node{
Type: DeclarationNode,
Data: "xml",
Attr: attributes,
level: 1,
}
AddChild(p.prev, node)
p.level = 1
p.prev = node
}
// Record the namespace declarations carried by this element.
for _, att := range tok.Attr {
if att.Name.Local == "xmlns" {
// https://github.com/antchfx/xmlquery/issues/67
// Register the default namespace unless the same URI is already known
// from a declaration made at a shallower level.
if prefix, ok := p.space2prefix[att.Value]; !ok || (ok && prefix.level >= p.level) {
p.space2prefix[att.Value] = &xmlnsPrefix{name: "", level: p.level} // the default namespace has an empty prefix
}
} else if att.Name.Space == "xmlns" {
// Note: a namespace URI declared again under another prefix overwrites
// the previous entry.
p.space2prefix[att.Value] = &xmlnsPrefix{name: att.Name.Local, level: p.level}
}
}
// In strict mode an element in an undeclared namespace is an error.
if space := tok.Name.Space; space != "" {
if _, found := p.space2prefix[space]; !found && p.decoder.Strict {
return nil, fmt.Errorf("xmlquery: invalid XML document, namespace %s is missing", space)
}
}
// Copy the attributes, replacing a known namespace in the attribute name
// by its prefix; the original value is kept in NamespaceURI.
attributes := make([]Attr, len(tok.Attr))
for i, att := range tok.Attr {
name := att.Name
if prefix, ok := p.space2prefix[name.Space]; ok {
name.Space = prefix.name
}
attributes[i] = Attr{
Name: name,
Value: att.Value,
NamespaceURI: att.Name.Space,
}
}
node := &Node{
Type: ElementNode,
Data: tok.Name.Local,
NamespaceURI: tok.Name.Space,
Attr: attributes,
level: p.level,
}
// Link the node relative to p.prev: as a sibling on the same level, as a
// child when deeper, otherwise climb to the ancestor chain of the node's
// level and append it there.
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
// Only set the node's prefix when the cached tag text really starts with
// "prefix:name"; an element that merely inherits a namespace keeps an
// empty prefix.
if node.NamespaceURI != "" {
if v, ok := p.space2prefix[node.NamespaceURI]; ok {
cached := string(p.reader.CacheWithLimit(len(v.name) + len(node.Data) + 2))
if strings.HasPrefix(cached, fmt.Sprintf("%s:%s", v.name, node.Data)) || strings.HasPrefix(cached, fmt.Sprintf("<%s:%s", v.name, node.Data)) {
node.Prefix = v.name
}
}
}
// If we're in the streaming mode, we need to remember the node if it is the target node
// so that when we finish processing the node's EndElement, we know how/what to return to
// caller. Also we need to remove the target node from the tree upon next Read() call so
// memory doesn't grow unbounded.
if p.streamElementXPath != nil {
if p.streamNode == nil {
if QuerySelector(p.doc, p.streamElementXPath) != nil {
p.streamNode = node
p.streamNodePrev = p.prev
streamElementNodeCounter = 1
}
} else {
// Nested element inside the current target: track the depth.
streamElementNodeCounter++
}
}
p.prev = node
p.level++
case xml.EndElement:
p.level--
// If we're in streaming mode, and we already have a potential streaming
// target node identified (p.streamNode != nil) then we need to check if
// this is the real one we want to return to caller.
if p.streamNode != nil {
streamElementNodeCounter--
if streamElementNodeCounter == 0 {
// Now we know this element node at least passes the initial
// p.streamElementXPath check and is a potential target node candidate.
// We need to have 1 more check with p.streamElementFilter (if given) to
// ensure it is really the element node we want.
// The reason we need a two-step check process is the following
// situation:
// <AAA><BBB>b1</BBB></AAA>
// And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during
// xml.StartElement time, the <BBB> node is still empty, so it will pass
// the p.streamElementXPath check. However, eventually we know this <BBB>
// shouldn't be returned to the caller. Having a second more fine-grained
// filter check ensures that. So in this case, the caller should really
// setup the stream parser with:
// streamElementXPath = "/AAA/BBB"
// streamElementFilter = "/AAA/BBB[. != 'b1']"
if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
return p.streamNode, nil
}
// otherwise, this isn't our target node, clean things up.
// note we also remove the underlying *Node from the node tree, to prevent
// future stream node candidate selection error.
RemoveFromTree(p.streamNode)
p.prev = p.streamNodePrev
p.streamNode = nil
p.streamNodePrev = nil
}
}
case xml.CharData:
// Upper-case the start of the cached token text so that the CDATA marker
// is matched case-insensitively; a match yields a CharDataNode, anything
// else a plain TextNode.
cached := bytes.ToUpper(p.reader.CacheWithLimit(9))
nodeType := TextNode
if bytes.HasPrefix(cached, []byte("<![CDATA[")) || bytes.HasPrefix(cached, []byte("![CDATA[")) {
nodeType = CharDataNode
}
node := &Node{Type: nodeType, Data: string(tok), level: p.level}
// Same linking rules as for elements; p.prev is not advanced to the new
// node for character data.
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
case xml.Comment:
node := &Node{Type: CommentNode, Data: string(tok), level: p.level}
// Same linking rules as for elements; p.prev is not advanced to the new
// node for comments.
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
case xml.ProcInst: // Processing Instruction
// A processing instruction following another declaration node stays on
// that node's level; otherwise it opens a new level.
if p.prev.Type != DeclarationNode {
p.level++
}
node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level}
// Split the instruction text on spaces and turn every key=value pair into
// an attribute, stripping surrounding quotes from the value. Pieces
// without a key before "=" are ignored.
pairs := strings.Split(string(tok.Inst), " ")
for _, pair := range pairs {
pair = strings.TrimSpace(pair)
if i := strings.Index(pair, "="); i > 0 {
AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"'`))
}
}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
p.prev = node
case xml.Directive:
// Directives (<!...>) are stored as NotationNode; p.prev is not advanced.
node := &Node{Type: NotationNode, Data: string(tok), level: p.level}
if p.level == p.prev.level {
AddSibling(p.prev, node)
} else if p.level > p.prev.level {
AddChild(p.prev, node)
} else if p.level < p.prev.level {
for i := p.prev.level - p.level; i > 1; i-- {
p.prev = p.prev.Parent
}
AddSibling(p.prev.Parent, node)
}
}
}
}
// StreamParser enables loading and parsing an XML document in a streaming
// fashion. Create one with CreateStreamParser or
// CreateStreamParserWithOptions and call Read repeatedly.
type StreamParser struct {
// p is the underlying parser, configured with the stream target expression(s).
p *parser
}
// CreateStreamParser creates a StreamParser reading from r with the default
// parser options. Argument streamElementXPath is required; argument
// streamElementFilter is optional and only meant for advanced scenarios.
//
// Scenario 1: simple case:
//
//	xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
//	sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
//	if err != nil {
//		panic(err)
//	}
//	for {
//		n, err := sp.Read()
//		if err != nil {
//			break
//		}
//		fmt.Println(n.OutputXML(true))
//	}
//
// Output will be:
//
//	<BBB>b1</BBB>
//	<BBB>b2</BBB>
//
// Scenario 2: advanced case:
//
//	xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
//	sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
//	if err != nil {
//		panic(err)
//	}
//	for {
//		n, err := sp.Read()
//		if err != nil {
//			break
//		}
//		fmt.Println(n.OutputXML(true))
//	}
//
// Output will be:
//
//	<BBB>b2</BBB>
//
// As the argument names indicate, streamElementXPath should only point at the
// target element node, without extra filtering on the element itself or its
// children; streamElementFilter, if needed, provides that additional
// filtering on the target element and its children.
//
// CreateStreamParser returns an error if streamElementXPath or
// streamElementFilter, if provided, cannot be parsed and compiled into a
// valid xpath query.
func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) {
	var defaults ParserOptions
	return CreateStreamParserWithOptions(r, defaults, streamElementXPath, streamElementFilter...)
}
// CreateStreamParserWithOptions is like CreateStreamParser, but applies the
// given parser options.
//
// streamElementXPath selects the target elements. Of streamElementFilter only
// the first value, if any, is used as the additional filter expression.
//
// An error is returned when either expression cannot be compiled; it wraps the
// underlying error, so errors.Is and errors.As can inspect the cause.
func CreateStreamParserWithOptions(
	r io.Reader,
	options ParserOptions,
	streamElementXPath string,
	streamElementFilter ...string,
) (*StreamParser, error) {
	elemXPath, err := getQuery(streamElementXPath)
	if err != nil {
		return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %w", streamElementXPath, err)
	}
	var elemFilter *xpath.Expr
	if len(streamElementFilter) > 0 {
		elemFilter, err = getQuery(streamElementFilter[0])
		if err != nil {
			return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %w", streamElementFilter[0], err)
		}
	}
	// Named p rather than parser so the local does not shadow the parser type.
	p := createParser(r)
	options.apply(p)
	p.streamElementXPath = elemXPath
	p.streamElementFilter = elemFilter
	return &StreamParser{p: p}, nil
}
// Read returns the next target node that satisfies the XPath given when the
// StreamParser was created. When the rest of the document holds no further
// target node, io.EOF is returned. Any XML parsing error is returned as well
// and stops the stream; calling Read again after an error (io.EOF included)
// results in undefined behavior. Because of the streaming nature, every call
// removes the previously returned target node from the document tree.
func (sp *StreamParser) Read() (*Node, error) {
	p := sp.p
	// Release the last target node so that memory use stays bounded.
	if last := p.streamNode; last != nil {
		// Also drop everything in front of it: unwanted nodes between the
		// targets (for example new line text nodes) would otherwise pile up
		// as first children and slow the stream down over time.
		for last.PrevSibling != nil {
			RemoveFromTree(last.PrevSibling)
		}
		p.prev = last.Parent
		RemoveFromTree(last)
		p.streamNode = nil
		p.streamNodePrev = nil
	}
	return p.parse()
}
+304
View File
@@ -0,0 +1,304 @@
/*
Package xmlquery extracts data from XML documents using XPath expressions.
*/
package xmlquery
import (
"fmt"
"strings"
"github.com/antchfx/xpath"
)
// SelectElements returns all nodes matched by name, which is evaluated as an
// XPath expression starting at n. Like Find, it panics when name is not a
// valid expression.
func (n *Node) SelectElements(name string) []*Node {
	matches := Find(n, name)
	return matches
}
// SelectElement returns the first node matched by name, which is evaluated as
// an XPath expression starting at n, or nil when nothing matches. Like
// FindOne, it panics when name is not a valid expression.
func (n *Node) SelectElement(name string) *Node {
	first := FindOne(n, name)
	return first
}
// SelectAttr returns the value of the attribute called name, or "" when the
// node has no such attribute. On an attribute node it returns the node's own
// text if the node is named name.
func (n *Node) SelectAttr(name string) string {
	if n.Type == AttributeNode {
		if n.Data != name {
			return ""
		}
		return n.InnerText()
	}
	want := newXMLName(name)
	for i := range n.Attr {
		if n.Attr[i].Name == want {
			return n.Attr[i].Value
		}
	}
	return ""
}
// Compile-time check that *NodeNavigator implements xpath.NodeNavigator.
var _ xpath.NodeNavigator = &NodeNavigator{}
// CreateXPathNavigator returns a navigator positioned on top, with top also
// acting as its root and no attribute selected.
func CreateXPathNavigator(top *Node) *NodeNavigator {
	nav := new(NodeNavigator)
	nav.root = top
	nav.curr = top
	nav.attr = -1
	return nav
}
// getCurrentNode returns the Node the iterator currently points at. When the
// iterator is positioned on an attribute, a new AttributeNode is built for it:
// its parent is the owning element, its name the attribute's local name, and
// its only child a text node holding the attribute value.
func getCurrentNode(it *xpath.NodeIterator) *Node {
	nav := it.Current().(*NodeNavigator)
	if nav.NodeType() != xpath.AttributeNode {
		return nav.curr
	}
	text := &Node{
		Type: TextNode,
		Data: nav.Value(),
	}
	attr := &Node{
		Parent:     nav.curr,
		Type:       AttributeNode,
		Data:       nav.LocalName(),
		FirstChild: text,
		LastChild:  text,
	}
	return attr
}
// Find returns all nodes matched by the XPath expression expr, like QueryAll,
// but panics instead of returning an error when expr is not valid.
func Find(top *Node, expr string) []*Node {
	matches, err := QueryAll(top, expr)
	if err == nil {
		return matches
	}
	panic(err)
}
// FindOne returns the first node matched by the XPath expression expr, like
// Query, but panics instead of returning an error when expr is not valid.
func FindOne(top *Node, expr string) *Node {
	match, err := Query(top, expr)
	if err == nil {
		return match
	}
	panic(err)
}
// QueryAll returns every node matched by the XPath expression expr, evaluated
// starting at top. An error is returned when expr cannot be parsed.
func QueryAll(top *Node, expr string) ([]*Node, error) {
	compiled, err := getQuery(expr)
	if err == nil {
		return QuerySelectorAll(top, compiled), nil
	}
	return nil, err
}
// Query returns the first node matched by the XPath expression expr,
// evaluated starting at top, or nil when nothing matches. An error is
// returned when expr cannot be parsed.
func Query(top *Node, expr string) (*Node, error) {
	compiled, err := getQuery(expr)
	if err == nil {
		return QuerySelector(top, compiled), nil
	}
	return nil, err
}
// QuerySelectorAll evaluates the compiled XPath selector starting at top and
// returns all matched nodes in iteration order. The result is nil when there
// is no match.
func QuerySelectorAll(top *Node, selector *xpath.Expr) []*Node {
	var matches []*Node
	for it := selector.Select(CreateXPathNavigator(top)); it.MoveNext(); {
		matches = append(matches, getCurrentNode(it))
	}
	return matches
}
// QuerySelector evaluates the compiled XPath selector starting at top and
// returns the first matched node, or nil when there is no match.
func QuerySelector(top *Node, selector *xpath.Expr) *Node {
	it := selector.Select(CreateXPathNavigator(top))
	if !it.MoveNext() {
		return nil
	}
	return getCurrentNode(it)
}
// FindEach evaluates the XPath expression expr starting at top and calls cb
// with the index and node of every match. It panics when expr is not valid.
//
// Deprecated: use for .. = range Find(){} instead.
func FindEach(top *Node, expr string, cb func(int, *Node)) {
	matches := Find(top, expr)
	for i := 0; i < len(matches); i++ {
		cb(i, matches[i])
	}
}
// FindEachWithBreak works like FindEach, but stops iterating as soon as the
// callback cb returns false.
//
// Deprecated: use for .. = range Find(){} instead.
func FindEachWithBreak(top *Node, expr string, cb func(int, *Node) bool) {
	matches := Find(top, expr)
	for i := 0; i < len(matches); i++ {
		if keepGoing := cb(i, matches[i]); !keepGoing {
			return
		}
	}
}
// NodeNavigator implements xpath.NodeNavigator on top of a tree of XML Nodes.
type NodeNavigator struct {
// root is the node the navigator was created for (MoveToRoot returns to
// it); curr is the node the navigator is currently positioned on.
root, curr *Node
// attr is the index into curr.Attr of the selected attribute, or -1 when
// the navigator is positioned on the node itself.
attr int
}
// Current returns the node the navigator is positioned on. While an attribute
// is selected, this is the node owning that attribute.
func (x *NodeNavigator) Current() *Node {
return x.curr
}
// NodeType maps the current position to an xpath node type. Text, character
// data and notation nodes are all reported as text nodes, declaration and
// document nodes as root nodes, and an element with a selected attribute as
// an attribute node. It panics for any other node type.
func (x *NodeNavigator) NodeType() xpath.NodeType {
	switch kind := x.curr.Type; kind {
	case ElementNode:
		if x.attr == -1 {
			return xpath.ElementNode
		}
		return xpath.AttributeNode
	case TextNode, CharDataNode, NotationNode:
		return xpath.TextNode
	case CommentNode:
		return xpath.CommentNode
	case DocumentNode, DeclarationNode:
		return xpath.RootNode
	default:
		panic(fmt.Sprintf("unknown XML node type: %v", kind))
	}
}
// LocalName returns the local name of the selected attribute or, when no
// attribute is selected, the Data of the current node.
func (x *NodeNavigator) LocalName() string {
	if x.attr == -1 {
		return x.curr.Data
	}
	return x.curr.Attr[x.attr].Name.Local
}
// Prefix returns the namespace prefix of the current position: the Space part
// of the attribute name when an attribute is selected, otherwise the Prefix
// of the current node.
func (x *NodeNavigator) Prefix() string {
	if x.NodeType() != xpath.AttributeNode {
		return x.curr.Prefix
	}
	if x.attr == -1 {
		return ""
	}
	return x.curr.Attr[x.attr].Name.Space
}
// NamespaceURL returns the namespace URI of the selected attribute or, when
// no attribute is selected, of the current node.
func (x *NodeNavigator) NamespaceURL() string {
	if x.attr == -1 {
		return x.curr.NamespaceURI
	}
	return x.curr.Attr[x.attr].NamespaceURI
}
// Value returns the string value of the current position:
//
//   - comment, text and character data (CDATA) nodes: the node's Data;
//   - elements: the value of the selected attribute, or the element's inner
//     text when no attribute is selected;
//   - any other node type: the empty string.
//
// Character data nodes are included because NodeType reports them as text
// nodes; returning "" for them made CDATA content invisible to expressions
// that compare text nodes.
func (x *NodeNavigator) Value() string {
	switch x.curr.Type {
	case CommentNode, TextNode, CharDataNode:
		return x.curr.Data
	case ElementNode:
		if x.attr != -1 {
			return x.curr.Attr[x.attr].Value
		}
		return x.curr.InnerText()
	}
	return ""
}
// Copy returns an independent navigator with the same root, current node and
// attribute position. The underlying nodes are shared, not copied.
func (x *NodeNavigator) Copy() xpath.NodeNavigator {
	clone := new(NodeNavigator)
	*clone = *x
	return clone
}
// MoveToRoot repositions the navigator on its root node.
//
// Any attribute selection is cleared as well: with a stale attribute index
// the navigator would still behave as an attribute of the root (MoveToChild
// fails, LocalName indexes the root's attributes).
func (x *NodeNavigator) MoveToRoot() {
	x.curr = x.root
	x.attr = -1
}
// MoveToParent moves to the parent of the current position and reports
// whether it moved. From an attribute this only clears the attribute
// selection, since the owning element is already the current node.
func (x *NodeNavigator) MoveToParent() bool {
	if x.attr != -1 {
		x.attr = -1
		return true
	}
	parent := x.curr.Parent
	if parent == nil {
		return false
	}
	x.curr = parent
	return true
}
// MoveToNextAttribute selects the next attribute of the current node (the
// first one when none is selected yet) and reports whether there was one.
func (x *NodeNavigator) MoveToNextAttribute() bool {
	next := x.attr + 1
	if next < len(x.curr.Attr) {
		x.attr = next
		return true
	}
	return false
}
// MoveToChild moves to the first child of the current node and reports
// whether it moved. It fails while an attribute is selected or when the node
// has no children.
func (x *NodeNavigator) MoveToChild() bool {
	if x.attr != -1 {
		return false
	}
	first := x.curr.FirstChild
	if first == nil {
		return false
	}
	x.curr = first
	return true
}
// MoveToFirst moves to the first node of the current sibling chain and
// reports whether it moved. It fails while an attribute is selected or when
// the current node already is the first sibling.
func (x *NodeNavigator) MoveToFirst() bool {
	if x.attr != -1 || x.curr.PrevSibling == nil {
		return false
	}
	for x.curr.PrevSibling != nil {
		x.curr = x.curr.PrevSibling
	}
	return true
}
// String returns the same string as Value.
func (x *NodeNavigator) String() string {
return x.Value()
}
// MoveToNext moves to the next sibling, skipping text nodes that contain only
// white space, and reports whether such a sibling was found. When it returns
// false the navigator may have advanced over skipped text nodes. It fails
// immediately while an attribute is selected.
func (x *NodeNavigator) MoveToNext() bool {
	if x.attr != -1 {
		return false
	}
	for x.curr.NextSibling != nil {
		x.curr = x.curr.NextSibling
		blankText := x.curr.Type == TextNode && strings.TrimSpace(x.curr.Data) == ""
		if !blankText {
			return true
		}
	}
	return false
}
// MoveToPrevious moves to the previous sibling, skipping text nodes that
// contain only white space, and reports whether such a sibling was found.
// When it returns false the navigator may have moved back over skipped text
// nodes. It fails immediately while an attribute is selected.
func (x *NodeNavigator) MoveToPrevious() bool {
	if x.attr != -1 {
		return false
	}
	for x.curr.PrevSibling != nil {
		x.curr = x.curr.PrevSibling
		blankText := x.curr.Type == TextNode && strings.TrimSpace(x.curr.Data) == ""
		if !blankText {
			return true
		}
	}
	return false
}
// MoveTo positions the navigator where other is and reports whether it did.
// It fails when other is not a *NodeNavigator or has a different root.
func (x *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
	target, ok := other.(*NodeNavigator)
	if ok && target.root == x.root {
		x.curr, x.attr = target.curr, target.attr
		return true
	}
	return false
}
+32
View File
@@ -0,0 +1,32 @@
# vscode
.vscode
debug
*.test
./build
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
+17
View File
@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+167
View File
@@ -0,0 +1,167 @@
# XPath
[![GoDoc](https://godoc.org/github.com/antchfx/xpath?status.svg)](https://godoc.org/github.com/antchfx/xpath)
[![Coverage Status](https://coveralls.io/repos/github/antchfx/xpath/badge.svg?branch=master)](https://coveralls.io/github/antchfx/xpath?branch=master)
[![Build Status](https://github.com/antchfx/xpath/actions/workflows/testing.yml/badge.svg)](https://github.com/antchfx/xpath/actions/workflows/testing.yml)
[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/xpath)](https://goreportcard.com/report/github.com/antchfx/xpath)
XPath is a Go package for selecting nodes from XML, HTML or other documents using XPath expressions.
# Implementation
- [htmlquery](https://github.com/antchfx/htmlquery) - an XPath query package for HTML document
- [xmlquery](https://github.com/antchfx/xmlquery) - an XPath query package for XML document.
- [jsonquery](https://github.com/antchfx/jsonquery) - an XPath query package for JSON document
# Supported Features
#### The basic XPath patterns.
> The basic XPath patterns cover 90% of the cases that most stylesheets will need.
- `node` : Selects all child elements with nodeName of node.
- `*` : Selects all child elements.
- `@attr` : Selects the attribute attr.
- `@*` : Selects all attributes.
- `node()` : Matches an org.w3c.dom.Node.
- `text()` : Matches a org.w3c.dom.Text node.
- `comment()` : Matches a comment.
- `.` : Selects the current node.
- `..` : Selects the parent of current node.
- `/` : Selects the document node.
- `a[expr]` : Select only those nodes matching a which also satisfy the expression expr.
- `a[n]` : Selects the nth node matching a. When a filter's expression is a number, XPath selects based on position.
- `a/b` : For each node matching a, add the nodes matching b to the result.
- `a//b` : For each node matching a, add the descendant nodes matching b to the result.
- `//b` : Returns elements in the entire document matching b.
- `a|b` : All nodes matching a or b, union operation(not boolean or).
- `(a, b, c)` : Evaluates each of its operands and concatenates the resulting sequences, in order, into a single result sequence
- `(a/b)` : Selects all matches nodes as grouping set.
#### Node Axes
- `child::*` : The child axis selects children of the current node.
- `child::node()`: Selects all the children of the context node.
- `child::text()`: Selects all text node children of the context node.
- `descendant::*` : The descendant axis selects descendants of the current node. It is equivalent to '//'.
- `descendant-or-self::*` : Selects descendants including the current node.
- `attribute::*` : Selects attributes of the current element. It is equivalent to @\*
- `following-sibling::*` : Selects nodes after the current node.
- `preceding-sibling::*` : Selects nodes before the current node.
- `following::*` : Selects the first matching node following in document order, excluding descendants.
- `preceding::*` : Selects the first matching node preceding in document order, excluding ancestors.
- `parent::*` : Selects the parent if it matches. The '..' pattern from the core is equivalent to 'parent::node()'.
- `ancestor::*` : Selects matching ancestors.
- `ancestor-or-self::*` : Selects ancestors including the current node.
- `self::*` : Selects the current node. '.' is equivalent to 'self::node()'.
#### Expressions
gxpath supports three types: number, boolean, and string.
- `path` : Selects nodes based on the path.
- `a = b` : Standard comparisons.
- `a = b` : True if a equals b.
- `a != b` : True if a is not equal to b.
- `a < b` : True if a is less than b.
- `a <= b` : True if a is less than or equal to b.
- `a > b` : True if a is greater than b.
- `a >= b` : True if a is greater than or equal to b.
- `a + b` : Arithmetic expressions.
- `- a` : Unary minus
- `a + b` : Addition
- `a - b` : Subtraction
- `a * b` : Multiplication
- `a div b` : Division
- `a mod b` : Modulus (division remainder)
- `a or b` : Boolean `or` operation.
- `a and b` : Boolean `and` operation.
- `(expr)` : Parenthesized expressions.
- `fun(arg1, ..., argn)` : Function calls:
| Function | Supported |
| ----------------------- | --------- |
| `boolean()` | ✓ |
| `ceiling()` | ✓ |
| `choose()` | ✗ |
| `concat()` | ✓ |
| `contains()` | ✓ |
| `count()` | ✓ |
| `current()` | ✗ |
| `document()` | ✗ |
| `element-available()` | ✗ |
| `ends-with()` | ✓ |
| `false()` | ✓ |
| `floor()` | ✓ |
| `format-number()` | ✗ |
| `function-available()` | ✗ |
| `generate-id()` | ✗ |
| `id()` | ✗ |
| `key()` | ✗ |
| `lang()` | ✗ |
| `last()` | ✓ |
| `local-name()` | ✓ |
| `lower-case()`[^1] | ✓ |
| `matches()` | ✓ |
| `name()` | ✓ |
| `namespace-uri()` | ✓ |
| `normalize-space()` | ✓ |
| `not()` | ✓ |
| `number()` | ✓ |
| `position()` | ✓ |
| `replace()` | ✓ |
| `reverse()` | ✓ |
| `round()` | ✓ |
| `starts-with()` | ✓ |
| `string()` | ✓ |
| `string-join()`[^1] | ✓ |
| `string-length()` | ✓ |
| `substring()` | ✓ |
| `substring-after()` | ✓ |
| `substring-before()` | ✓ |
| `sum()` | ✓ |
| `system-property()` | ✗ |
| `translate()` | ✓ |
| `true()` | ✓ |
| `unparsed-entity-url()` | ✗ |
[^1]: XPath-2.0 expression
+718
View File
@@ -0,0 +1,718 @@
package xpath
import (
"errors"
"fmt"
)
// flag is a set of bit flags handed down to the node processors while a
// query is being built.
type flag int
// flagsEnum names the individual flag bits. The non-zero values are distinct
// powers of two, so they can be combined with | and tested with &.
var flagsEnum = struct {
None flag
SmartDesc flag
PosFilter flag
Filter flag
Condition flag
}{
None: 0,
SmartDesc: 1,
PosFilter: 2,
Filter: 4,
Condition: 8,
}
// builderProp is a set of bit flags describing properties of the query built
// so far; processors update it through a pointer.
type builderProp int
// builderProps names the individual property bits. The non-zero values are
// distinct powers of two, so they can be combined with | and tested with &.
var builderProps = struct {
None builderProp
PosFilter builderProp
HasPosition builderProp
HasLast builderProp
NonFlat builderProp
}{
None: 0,
PosFilter: 1,
HasPosition: 2,
HasLast: 4,
NonFlat: 8,
}
// builder builds a query from a parsed XPath expression.
type builder struct {
// parseDepth presumably tracks the nesting depth during building; it is not
// used in the code visible here (TODO confirm).
parseDepth int
// firstInput is reset to nil at the start of processAxis; its other uses are
// outside the code visible here.
firstInput query
}
// axisPredicate returns the node test for the axis node root. The returned
// function reports whether a navigator position passes the test: its node
// type must equal root's type test (or the test must accept all nodes) and,
// when root carries a name, the name must match as well. Names are compared
// by local name plus namespace URI when root has one and the navigator can
// report its namespace, and by local name plus prefix otherwise.
func axisPredicate(root *axisNode) func(NodeNavigator) bool {
	// Whether a name test applies is decided once, when the predicate is built.
	nametest := root.LocalName != "" || root.Prefix != ""
	return func(n NodeNavigator) bool {
		if root.typeTest != n.NodeType() && root.typeTest != allNode {
			return false
		}
		if !nametest {
			return true
		}
		type namespaceAware interface {
			NamespaceURL() string
		}
		if ns, ok := n.(namespaceAware); ok && root.hasNamespaceURI {
			return root.LocalName == n.LocalName() && root.namespaceURI == ns.NamespaceURL()
		}
		return root.LocalName == n.LocalName() && root.Prefix == n.Prefix()
	}
}
// processAxis builds the query for an XPath axis step such as child::a or
// descendant-or-self::node().
//
// flags carries hints from the enclosing expression: Filter means the step is
// the input of a predicate, SmartDesc means the step feeds a descendant step.
// props receives the properties of the built query; NonFlat is set for axes
// built here as ancestor, descendant, following or preceding queries.
//
// A child step whose input is a descendant-or-self step (as produced, for
// example, by the abbreviated form //name) is collapsed into one
// descendantQuery when the Filter flag is not set.
//
// An error is returned for an unknown axis and for the namespace axis, which
// has no query implementation. Previously the namespace axis yielded a nil
// query with a nil error, which let a nil query become the Input of an
// enclosing step.
func (b *builder) processAxis(root *axisNode, flags flag, props *builderProp) (query, error) {
	var (
		err      error
		qyInput  query
		qyOutput query
	)
	b.firstInput = nil
	predicate := axisPredicate(root)

	if root.Input == nil {
		// No explicit input: the step is relative to the context node.
		qyInput = &contextQuery{}
		*props = builderProps.None
	} else {
		inputFlags := flagsEnum.None
		if (flags & flagsEnum.Filter) == 0 {
			if root.AxisType == "child" && (root.Input.Type() == nodeAxis) {
				if input := root.Input.(*axisNode); input.AxisType == "descendant-or-self" {
					// descendant-or-self followed by child: build a single
					// descendant query over the grand-input instead of two steps.
					var qyGrandInput query
					if input.Input != nil {
						qyGrandInput, err = b.processNode(input.Input, flagsEnum.SmartDesc, props)
						if err != nil {
							return nil, err
						}
					} else {
						qyGrandInput = &contextQuery{}
					}
					qyOutput = &descendantQuery{name: root.LocalName, Input: qyGrandInput, Predicate: predicate, Self: false}
					*props |= builderProps.NonFlat
					return qyOutput, nil
				}
			}
			if root.AxisType == "descendant" || root.AxisType == "descendant-or-self" {
				// Tell the input step that a descendant step consumes it.
				inputFlags |= flagsEnum.SmartDesc
			}
		}

		qyInput, err = b.processNode(root.Input, inputFlags, props)
		if err != nil {
			return nil, err
		}
	}

	switch root.AxisType {
	case "ancestor":
		qyOutput = &ancestorQuery{name: root.LocalName, Input: qyInput, Predicate: predicate}
		*props |= builderProps.NonFlat
	case "ancestor-or-self":
		qyOutput = &ancestorQuery{name: root.LocalName, Input: qyInput, Predicate: predicate, Self: true}
		*props |= builderProps.NonFlat
	case "attribute":
		qyOutput = &attributeQuery{name: root.LocalName, Input: qyInput, Predicate: predicate}
	case "child":
		// The cached variant is used once the input has been marked NonFlat.
		if (*props & builderProps.NonFlat) == 0 {
			qyOutput = &childQuery{name: root.LocalName, Input: qyInput, Predicate: predicate}
		} else {
			qyOutput = &cachedChildQuery{name: root.LocalName, Input: qyInput, Predicate: predicate}
		}
	case "descendant":
		if (flags & flagsEnum.SmartDesc) != flagsEnum.None {
			qyOutput = &descendantOverDescendantQuery{name: root.LocalName, Input: qyInput, MatchSelf: false, Predicate: predicate}
		} else {
			qyOutput = &descendantQuery{name: root.LocalName, Input: qyInput, Predicate: predicate}
		}
		*props |= builderProps.NonFlat
	case "descendant-or-self":
		if (flags & flagsEnum.SmartDesc) != flagsEnum.None {
			qyOutput = &descendantOverDescendantQuery{name: root.LocalName, Input: qyInput, MatchSelf: true, Predicate: predicate}
		} else {
			qyOutput = &descendantQuery{name: root.LocalName, Input: qyInput, Predicate: predicate, Self: true}
		}
		*props |= builderProps.NonFlat
	case "following":
		qyOutput = &followingQuery{Input: qyInput, Predicate: predicate}
		*props |= builderProps.NonFlat
	case "following-sibling":
		qyOutput = &followingQuery{Input: qyInput, Predicate: predicate, Sibling: true}
	case "parent":
		qyOutput = &parentQuery{Input: qyInput, Predicate: predicate}
	case "preceding":
		qyOutput = &precedingQuery{Input: qyInput, Predicate: predicate}
		*props |= builderProps.NonFlat
	case "preceding-sibling":
		qyOutput = &precedingQuery{Input: qyInput, Predicate: predicate, Sibling: true}
	case "self":
		qyOutput = &selfQuery{Input: qyInput, Predicate: predicate}
	case "namespace":
		// There is no query type for this axis; fail the build explicitly
		// rather than handing a nil query back to the caller.
		return nil, errors.New("xpath: the namespace axis is not supported")
	default:
		err = fmt.Errorf("unknown axe type: %s", root.AxisType)
		return nil, err
	}
	return qyOutput, nil
}
// canBeNumber reports whether q may evaluate to a number: its declared result
// type is either Number or the unconstrained Any.
func canBeNumber(q query) bool {
	if q.ValueType() == xpathResultType.Any {
		return true
	}
	return q.ValueType() == xpathResultType.Number
}
// processFilter builds the query for a filter node, i.e. an input expression
// followed by a predicate condition.
//
// flags are the hints received from the enclosing expression; the input is
// built with the Filter flag added. props is updated with PosFilter when the
// condition is positional. The result is a filterQuery, or a mergeQuery
// wrapping one when the first input step was detached from its parent (see
// the merge branch below).
func (b *builder) processFilter(root *filterNode, flags flag, props *builderProp) (query, error) {
// first is true when the caller did not already mark this node as being
// inside a filter, i.e. this is the outermost filter of a predicate chain.
first := (flags & flagsEnum.Filter) == 0
qyInput, err := b.processNode(root.Input, (flags | flagsEnum.Filter), props)
if err != nil {
return nil, err
}
// Remember the input's first step now: building the condition below goes
// through processNode again, which may overwrite b.firstInput.
firstInput := b.firstInput
var propsCond builderProp
cond, err := b.processNode(root.Condition, flags, &propsCond)
if err != nil {
return nil, err
}
// A condition that may yield a number, or that uses position()/last(),
// is treated as a positional predicate.
if canBeNumber(cond) || ((propsCond & (builderProps.HasPosition | builderProps.HasLast)) != 0) {
propsCond |= builderProps.HasPosition
flags |= flagsEnum.PosFilter
}
// Only a directly nested filter input keeps an inherited PosFilter bit.
if root.Input.Type() != nodeFilter {
*props &= ^builderProps.PosFilter
}
if (propsCond & builderProps.HasPosition) != 0 {
*props |= builderProps.PosFilter
}
if (propsCond & builderProps.HasPosition) != builderProps.None {
if (propsCond & builderProps.HasLast) != 0 {
// https://github.com/antchfx/xpath/issues/76
// https://github.com/antchfx/xpath/issues/78
// When the condition is a function query whose Input is itself a
// filterQuery, replace it with a lastFuncQuery over that input.
if qyFunc, ok := cond.(*functionQuery); ok {
switch qyFunc.Input.(type) {
case *filterQuery:
cond = &lastFuncQuery{Input: qyFunc.Input}
}
}
}
}
merge := (qyInput.Properties() & queryProps.Merge) != 0
if first && firstInput != nil {
if merge && ((*props & builderProps.PosFilter) != 0) {
var (
rootQuery = &contextQuery{}
parent query
)
// Every case below does the same thing for a different concrete query
// type: if the first step's Input is not already a contextQuery, keep
// that Input as parent and re-root the step on a fresh contextQuery.
// The cases cannot be merged because Input is a field of each concrete
// struct rather than a method of the query interface.
switch axisQuery := firstInput.(type) {
case *ancestorQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *attributeQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *childQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *cachedChildQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *descendantQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *followingQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *precedingQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *parentQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *selfQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *groupQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
case *descendantOverDescendantQuery:
if _, ok := axisQuery.Input.(*contextQuery); !ok {
parent = axisQuery.Input
axisQuery.Input = rootQuery
}
}
b.firstInput = nil
child := &filterQuery{Input: qyInput, Predicate: cond, NoPosition: false}
if parent != nil {
// NOTE(review): mergeQuery presumably evaluates child once per node
// selected by parent so positions are counted per parent; confirm
// against the mergeQuery implementation.
return &mergeQuery{Input: parent, Child: child}, nil
}
return child, nil
}
b.firstInput = nil
}
// General case: a plain filter; NoPosition is set when the condition was
// not classified as positional above.
resultQuery := &filterQuery{
Input: qyInput,
Predicate: cond,
NoPosition: (propsCond & builderProps.HasPosition) == 0,
}
return resultQuery, nil
}
// processFunction builds the query for an XPath function call.
//
// Arguments are compiled left to right through processNode. Because
// processNode resets *props on entry, *props ends up describing the last
// argument compiled; last() and position() additionally set HasLast and
// HasPosition.
//
// An error is returned when the function name is not supported, when too few
// arguments are supplied, or, for functions with a fixed arity check, when
// the argument count is wrong. lower-case, starts-with, ends-with and contains
// now reject calls with too few arguments instead of indexing past the end of
// root.Args; surplus arguments to them are still ignored, as before.
// boolean() with no argument is rejected at build time because it has no
// default operand.
func (b *builder) processFunction(root *functionNode, props *builderProp) (query, error) {
	// Reset builder props
	*props = builderProps.None

	// compile builds the queries for the first n arguments, in order.
	// Callers must have checked that len(root.Args) >= n.
	compile := func(n int) ([]query, error) {
		built := make([]query, 0, n)
		for _, arg := range root.Args[:n] {
			q, err := b.processNode(arg, flagsEnum.None, props)
			if err != nil {
				return nil, err
			}
			built = append(built, q)
		}
		return built, nil
	}

	var qyOutput query
	switch root.FuncName {
	case "lower-case":
		if len(root.Args) < 1 {
			return nil, errors.New("xpath: lower-case function must have at least one parameter")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: lowerCaseFunc(args[0])}
	case "starts-with", "ends-with", "contains":
		if len(root.Args) < 2 {
			return nil, fmt.Errorf("xpath: %s function must have at least two parameters", root.FuncName)
		}
		args, err := compile(2)
		if err != nil {
			return nil, err
		}
		switch root.FuncName {
		case "starts-with":
			qyOutput = &functionQuery{Func: startwithFunc(args[0], args[1])}
		case "ends-with":
			qyOutput = &functionQuery{Func: endwithFunc(args[0], args[1])}
		case "contains":
			qyOutput = &functionQuery{Func: containsFunc(args[0], args[1])}
		}
	case "matches":
		//matches(string , pattern)
		if len(root.Args) != 2 {
			return nil, errors.New("xpath: matches function must have two parameters")
		}
		args, err := compile(2)
		if err != nil {
			return nil, err
		}
		// Issue #92, testing the regular expression before.
		if q, ok := args[1].(*constantQuery); ok {
			pattern, isString := q.Val.(string)
			if !isString {
				// A constant that is not a string can never be a valid pattern.
				return nil, errors.New("xpath: matches function second parameter must be a string")
			}
			if _, err = getRegexp(pattern); err != nil {
				return nil, fmt.Errorf("matches() got error. %v", err)
			}
		}
		qyOutput = &functionQuery{Func: matchesFunc(args[0], args[1])}
	case "substring":
		//substring( string , start [, length] )
		if len(root.Args) < 2 {
			return nil, errors.New("xpath: substring function must have at least two parameter")
		}
		args, err := compile(2)
		if err != nil {
			return nil, err
		}
		// length stays nil unless exactly three arguments were given.
		var length query
		if len(root.Args) == 3 {
			if length, err = b.processNode(root.Args[2], flagsEnum.None, props); err != nil {
				return nil, err
			}
		}
		qyOutput = &functionQuery{Func: substringFunc(args[0], args[1], length)}
	case "substring-before", "substring-after":
		//substring-xxxx( haystack, needle )
		if len(root.Args) != 2 {
			return nil, fmt.Errorf("xpath: %s function must have two parameters", root.FuncName)
		}
		args, err := compile(2)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{
			Func: substringIndFunc(args[0], args[1], root.FuncName == "substring-after"),
		}
	case "string-length":
		// string-length( [string] )
		if len(root.Args) < 1 {
			return nil, errors.New("xpath: string-length function must have at least one parameter")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: stringLengthFunc(args[0])}
	case "normalize-space":
		// Without an argument the function applies to the context node.
		var arg node
		if len(root.Args) > 0 {
			arg = root.Args[0]
		} else {
			arg = newAxisNode("self", allNode, "", "", "", nil)
		}
		arg1, err := b.processNode(arg, flagsEnum.None, props)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: normalizespaceFunc(arg1)}
	case "replace":
		//replace( string , string, string )
		if len(root.Args) != 3 {
			return nil, errors.New("xpath: replace function must have three parameters")
		}
		args, err := compile(3)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: replaceFunc(args[0], args[1], args[2])}
	case "translate":
		//translate( string , string, string )
		if len(root.Args) != 3 {
			return nil, errors.New("xpath: translate function must have three parameters")
		}
		args, err := compile(3)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: translateFunc(args[0], args[1], args[2])}
	case "not":
		if len(root.Args) == 0 {
			return nil, errors.New("xpath: not function must have at least one parameter")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: notFunc(args[0])}
	case "name", "local-name", "namespace-uri":
		if len(root.Args) > 1 {
			return nil, fmt.Errorf("xpath: %s function must have at most one parameter", root.FuncName)
		}
		// arg stays nil when the call has no argument; the function
		// implementations then use the context node.
		var arg query
		if len(root.Args) == 1 {
			args, err := compile(1)
			if err != nil {
				return nil, err
			}
			arg = args[0]
		}
		switch root.FuncName {
		case "name":
			qyOutput = &functionQuery{Func: nameFunc(arg)}
		case "local-name":
			qyOutput = &functionQuery{Func: localNameFunc(arg)}
		case "namespace-uri":
			qyOutput = &functionQuery{Func: namespaceFunc(arg)}
		}
	case "true", "false":
		val := root.FuncName == "true"
		qyOutput = &functionQuery{
			Func: func(_ query, _ iterator) interface{} {
				return val
			},
		}
	case "last":
		qyOutput = &functionQuery{Input: b.firstInput, Func: lastFunc()}
		*props |= builderProps.HasLast
	case "position":
		qyOutput = &functionQuery{Input: b.firstInput, Func: positionFunc()}
		*props |= builderProps.HasPosition
	case "boolean", "number", "string":
		if len(root.Args) > 1 {
			return nil, fmt.Errorf("xpath: %s function must have at most one parameter", root.FuncName)
		}
		if root.FuncName == "boolean" && len(root.Args) == 0 {
			// boolean() has no implicit operand; evaluating it without one
			// would dereference a nil query.
			return nil, errors.New("xpath: boolean function must have one parameter")
		}
		var inp query
		if len(root.Args) == 1 {
			args, err := compile(1)
			if err != nil {
				return nil, err
			}
			inp = args[0]
		}
		switch root.FuncName {
		case "boolean":
			qyOutput = &functionQuery{Func: booleanFunc(inp)}
		case "string":
			qyOutput = &functionQuery{Func: stringFunc(inp)}
		case "number":
			qyOutput = &functionQuery{Func: numberFunc(inp)}
		}
	case "count":
		if len(root.Args) == 0 {
			return nil, fmt.Errorf("xpath: count(node-sets) function must with have parameters node-sets")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: countFunc(args[0])}
	case "sum":
		if len(root.Args) == 0 {
			return nil, fmt.Errorf("xpath: sum(node-sets) function must with have parameters node-sets")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: sumFunc(args[0])}
	case "ceiling", "floor", "round":
		if len(root.Args) == 0 {
			return nil, fmt.Errorf("xpath: %s(node-sets) function must with have parameters node-sets", root.FuncName)
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		switch root.FuncName {
		case "ceiling":
			qyOutput = &functionQuery{Func: ceilingFunc(args[0])}
		case "floor":
			qyOutput = &functionQuery{Func: floorFunc(args[0])}
		case "round":
			qyOutput = &functionQuery{Func: roundFunc(args[0])}
		}
	case "concat":
		if len(root.Args) < 2 {
			return nil, fmt.Errorf("xpath: concat() must have at least two arguments")
		}
		args, err := compile(len(root.Args))
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: concatFunc(args...)}
	case "reverse":
		if len(root.Args) == 0 {
			return nil, fmt.Errorf("xpath: reverse(node-sets) function must with have parameters node-sets")
		}
		args, err := compile(1)
		if err != nil {
			return nil, err
		}
		qyOutput = &transformFunctionQuery{Input: args[0], Func: reverseFunc}
	case "string-join":
		if len(root.Args) != 2 {
			return nil, fmt.Errorf("xpath: string-join(node-sets, separator) function requires node-set and argument")
		}
		args, err := compile(2)
		if err != nil {
			return nil, err
		}
		qyOutput = &functionQuery{Func: stringJoinFunc(args[0], args[1])}
	default:
		return nil, fmt.Errorf("not yet support this function %s()", root.FuncName)
	}
	return qyOutput, nil
}
// processOperator builds the query for a binary operator node.
//
// Both operands are built first and their properties are OR-ed into *props.
// Arithmetic operators produce a numericQuery, comparison operators a
// logicalQuery, "and"/"or" a booleanQuery and "|" a unionQuery (which also
// marks the result NonFlat). An operator outside those sets yields a nil query
// and a nil error.
func (b *builder) processOperator(root *operatorNode, props *builderProp) (query, error) {
	var lhsProps, rhsProps builderProp

	lhs, err := b.processNode(root.Left, flagsEnum.None, &lhsProps)
	if err != nil {
		return nil, err
	}
	rhs, err := b.processNode(root.Right, flagsEnum.None, &rhsProps)
	if err != nil {
		return nil, err
	}
	*props = lhsProps | rhsProps

	// At most one of these is set by the switch below.
	var (
		arith   func(iterator, interface{}, interface{}) interface{}
		compare func(iterator, interface{}, interface{}) interface{}
	)
	switch root.Op {
	// Numeric operators.
	case "+":
		arith = plusFunc
	case "-":
		arith = minusFunc
	case "*":
		arith = mulFunc
	case "div":
		arith = divFunc
	case "mod":
		arith = modFunc
	// Comparison operators.
	case "=":
		compare = eqFunc
	case ">":
		compare = gtFunc
	case ">=":
		compare = geFunc
	case "<":
		compare = ltFunc
	case "<=":
		compare = leFunc
	case "!=":
		compare = neFunc
	// Boolean connectives.
	case "or", "and":
		return &booleanQuery{Left: lhs, Right: rhs, IsOr: root.Op == "or"}, nil
	// Node-set union.
	case "|":
		*props |= builderProps.NonFlat
		return &unionQuery{Left: lhs, Right: rhs}, nil
	}

	switch {
	case arith != nil:
		return &numericQuery{Left: lhs, Right: rhs, Do: arith}, nil
	case compare != nil:
		return &logicalQuery{Left: lhs, Right: rhs, Do: compare}, nil
	}
	return nil, nil
}
// processNode builds the query for any node of the parsed expression tree by
// dispatching on the node type.
//
// It resets *props before dispatching, tracks nesting in b.parseDepth (an
// error is returned beyond 1024 levels) and records the query produced for
// axis and filter nodes in b.firstInput. A group node records its query there
// only when no first input is set.
func (b *builder) processNode(root node, flags flag, props *builderProp) (q query, err error) {
	b.parseDepth++
	if b.parseDepth > 1024 {
		return q, errors.New("the xpath expressions is too complex")
	}
	*props = builderProps.None

	switch root.Type() {
	case nodeConstantOperand:
		q = &constantQuery{Val: root.(*operandNode).Val}
	case nodeRoot:
		q = &absoluteQuery{}
	case nodeAxis:
		q, err = b.processAxis(root.(*axisNode), flags, props)
		b.firstInput = q
	case nodeFilter:
		q, err = b.processFilter(root.(*filterNode), flags, props)
		b.firstInput = q
	case nodeFunction:
		q, err = b.processFunction(root.(*functionNode), props)
	case nodeOperator:
		q, err = b.processOperator(root.(*operatorNode), props)
	case nodeGroup:
		inner, innerErr := b.processNode(root.(*groupNode).Input, flagsEnum.None, props)
		if innerErr != nil {
			// The depth counter is intentionally left as is on this path.
			return inner, innerErr
		}
		q = &groupQuery{Input: inner}
		if b.firstInput == nil {
			b.firstInput = q
		}
	}

	b.parseDepth--
	return q, err
}
// build parses expr with the given namespace bindings and compiles it into a
// query. A panic raised while parsing or building is recovered and reported
// through err: a string panic becomes an error with that text, an error panic
// is returned as is, and anything else becomes "unknown panic".
func build(expr string, namespaces map[string]string) (q query, err error) {
	defer func() {
		e := recover()
		if e == nil {
			return
		}
		if msg, isString := e.(string); isString {
			err = errors.New(msg)
		} else if cause, isError := e.(error); isError {
			err = cause
		} else {
			err = errors.New("unknown panic")
		}
	}()

	tree := parse(expr, namespaces)
	var props = builderProps.None
	return (&builder{}).processNode(tree, flagsEnum.None, &props)
}
+80
View File
@@ -0,0 +1,80 @@
package xpath
import (
"regexp"
"sync"
)
// loadFunc produces the value for a key that is not in the cache yet.
type loadFunc func(key interface{}) (interface{}, error)

// defaultCap is the capacity used by the package's default cache.
const defaultCap = 1 << 16

// loadingCache is a small read-through cache that drops all of its entries
// once its capacity is reached.
//
// It is hand-rolled, rather than built on an LRU package such as
// github.com/hashicorp/golang-lru, so that this library keeps having no
// dependency outside the Go SDK and keeps building on old Go releases (1.6,
// 1.9, 1.10 and later). The default capacity is expected to be large enough
// for long-running services whose set of distinct regexp patterns is small.
// In the rare case where the capacity is hit, the cache is simply emptied:
// the following lookups reload their values, a small amortized cost accepted
// in exchange for not maintaining an LRU structure.
type loadingCache struct {
	sync.RWMutex
	cap   int
	load  loadFunc
	m     map[interface{}]interface{}
	reset int
}

// NewLoadingCache creates a new instance of a loading cache with capacity. Capacity must be >= 0, or
// it will panic. Capacity == 0 means the cache growth is unbounded.
func NewLoadingCache(load loadFunc, capacity int) *loadingCache {
	if capacity < 0 {
		panic("capacity must be >= 0")
	}
	cache := &loadingCache{
		cap:  capacity,
		load: load,
	}
	cache.m = make(map[interface{}]interface{})
	return cache
}

// get returns the cached value for key, calling the load function and storing
// its result on a miss. A load error is returned as is and nothing is stored.
// When the cache is bounded and already holds cap entries, all entries are
// dropped and the reset counter is incremented before the new value is stored.
func (c *loadingCache) get(key interface{}) (interface{}, error) {
	c.RLock()
	cached, hit := c.m[key]
	c.RUnlock()
	if hit {
		return cached, nil
	}

	// The load runs outside of any lock.
	loaded, err := c.load(key)
	if err != nil {
		return nil, err
	}

	c.Lock()
	if full := c.cap > 0 && len(c.m) >= c.cap; full {
		c.m = make(map[interface{}]interface{})
		c.reset++
	}
	c.m[key] = loaded
	c.Unlock()
	return loaded, nil
}
// RegexpCache maps a pattern string to its compiled *regexp.Regexp and is the
// cache used by this package to look up regular expressions. It is exported so
// that, in rare cases, a client can replace it with a cache that uses a
// different load function and/or capacity.
var RegexpCache = defaultRegexpCache()
// defaultRegexpCache returns a loading cache of capacity defaultCap whose load
// function compiles its key, which must be a string, with regexp.Compile.
func defaultRegexpCache() *loadingCache {
	compile := func(key interface{}) (interface{}, error) {
		return regexp.Compile(key.(string))
	}
	return NewLoadingCache(compile, defaultCap)
}
// getRegexp returns the compiled form of pattern, going through RegexpCache so
// that each distinct pattern is normally compiled only once. The error of a
// failed compilation is returned unchanged.
func getRegexp(pattern string) (*regexp.Regexp, error) {
	cached, err := RegexpCache.get(pattern)
	if err == nil {
		return cached.(*regexp.Regexp), nil
	}
	return nil, err
}
+679
View File
@@ -0,0 +1,679 @@
package xpath
import (
"errors"
"fmt"
"math"
"strconv"
"strings"
"sync"
"unicode"
)
// stringBuilder is the subset of methods shared by strings.Builder
// (Go 1.10 and later) and bytes.Buffer (earlier Go versions), so the string
// functions in this file can use whichever one the build provides.
type stringBuilder interface {
// WriteRune appends the UTF-8 encoding of r.
WriteRune(r rune) (n int, err error)
// WriteString appends s.
WriteString(s string) (int, error)
// Reset empties the builder so that it can be reused.
Reset()
// Grow reserves room for at least n more bytes.
Grow(n int)
// String returns the accumulated text.
String() string
}
// builderPool recycles stringBuilder values between function evaluations.
// Users in this file Reset a builder before putting it back.
var builderPool = sync.Pool{New: func() interface{} {
return newStringBuilder()
}}
// The XPath function list.
// predicate returns the node test attached to q: the Test method when q
// provides one, otherwise a function that accepts every node.
func predicate(q query) func(NodeNavigator) bool {
	type Predicater interface {
		Test(NodeNavigator) bool
	}
	p, ok := q.(Predicater)
	if !ok {
		return func(NodeNavigator) bool { return true }
	}
	return p.Test
}
// positionFunc implements the XPath node-set function position().
//
// The position is one plus the number of nodes reachable from the current node
// through MoveToPrevious that pass the node test of q.
func positionFunc() func(query, iterator) interface{} {
	return func(q query, t iterator) interface{} {
		cursor := t.Current().Copy()
		accept := predicate(q)

		pos := 1
		for cursor.MoveToPrevious() {
			if accept(cursor) {
				pos++
			}
		}
		return float64(pos)
	}
}
// lastFunc implements the XPath node-set function last().
//
// A copy of the current node is moved to the first node with MoveToFirst and
// then walked forward with MoveToNext; the result is the number of visited
// nodes that pass the node test of q.
func lastFunc() func(query, iterator) interface{} {
	return func(q query, t iterator) interface{} {
		cursor := t.Current().Copy()
		accept := predicate(q)
		cursor.MoveToFirst()

		total := 0
		for more := true; more; more = cursor.MoveToNext() {
			if accept(cursor) {
				total++
			}
		}
		return float64(total)
	}
}
// countFunc implements the XPath node-set function count(node-set).
//
// When the argument evaluates to a query, the result is the number of selected
// nodes that pass the argument's node test; any other value counts as 0.
func countFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		q := functionArgs(arg)
		accept := predicate(q)

		nodes, isNodeSet := q.Evaluate(t).(query)
		if !isNodeSet {
			return float64(0)
		}
		n := 0
		for {
			node := nodes.Select(t)
			if node == nil {
				break
			}
			if accept(node) {
				n++
			}
		}
		return float64(n)
	}
}
// sumFunc implements the XPath node-set function sum(node-set).
//
// For a node-set the values that parse as floats are added up and the others
// are skipped. A number is returned unchanged. A string must parse as a float,
// otherwise the function panics. Any other value sums to 0.
func sumFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var total float64
		switch v := functionArgs(arg).Evaluate(t).(type) {
		case float64:
			total = v
		case string:
			parsed, err := strconv.ParseFloat(v, 64)
			if err != nil {
				panic(errors.New("sum() function argument type must be a node-set or number"))
			}
			total = parsed
		case query:
			for {
				node := v.Select(t)
				if node == nil {
					break
				}
				if parsed, err := strconv.ParseFloat(node.Value(), 64); err == nil {
					total += parsed
				}
			}
		}
		return total
	}
}
// asNumber converts an evaluated XPath value to a number, following the rules
// of the XPath number() function:
//
//   - a number is returned unchanged;
//   - a boolean converts to 1 (true) or 0 (false);
//   - a string is parsed after stripping leading and trailing XPath whitespace
//     (space, tab, CR, LF); an unparsable string is NaN;
//   - a node-set (query) is converted through the value of its first selected
//     node; an empty node-set is NaN;
//   - any other value is NaN.
func asNumber(t iterator, o interface{}) float64 {
	// parse converts a string value, tolerating surrounding whitespace as
	// node text frequently carries indentation and line breaks.
	parse := func(s string) float64 {
		v, err := strconv.ParseFloat(strings.Trim(s, " \t\r\n"), 64)
		if err != nil {
			return math.NaN()
		}
		return v
	}

	switch typ := o.(type) {
	case query:
		node := typ.Select(t)
		if node == nil {
			return math.NaN()
		}
		return parse(node.Value())
	case float64:
		return typ
	case bool:
		if typ {
			return 1
		}
		return 0
	case string:
		return parse(typ)
	}
	return math.NaN()
}
// ceilingFunc implements the XPath function ceiling(number): the argument is
// converted with asNumber and rounded up with math.Ceil. An argument that is
// not a number yields NaN rather than a panic.
func ceilingFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		operand := functionArgs(arg).Evaluate(t)
		return math.Ceil(asNumber(t, operand))
	}
}
// floorFunc implements the XPath function floor(number): the argument is
// converted with asNumber and rounded down with math.Floor.
func floorFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		operand := functionArgs(arg).Evaluate(t)
		return math.Floor(asNumber(t, operand))
	}
}
// roundFunc implements the XPath function round(number): the argument is
// converted with asNumber and rounded with the package's round helper (used
// instead of math.Round).
func roundFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		operand := functionArgs(arg).Evaluate(t)
		return round(asNumber(t, operand))
	}
}
// nameFunc implements the XPath function name([node-set]).
//
// It returns "prefix:local" for a node with a prefix and the local name
// otherwise. Without an argument the current node is used; with one, the first
// node it selects is used, and "" is returned when it selects nothing.
func nameFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var node NodeNavigator
		switch {
		case arg == nil:
			node = t.Current()
		default:
			if node = arg.Clone().Select(t); node == nil {
				return ""
			}
		}
		if prefix := node.Prefix(); prefix != "" {
			return prefix + ":" + node.LocalName()
		}
		return node.LocalName()
	}
}
// localNameFunc implements the XPath function local-name([node-set]).
//
// Without an argument the current node is used; with one, the first node it
// selects is used, and "" is returned when it selects nothing.
func localNameFunc(arg query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var node NodeNavigator
		switch {
		case arg == nil:
			node = t.Current()
		default:
			if node = arg.Clone().Select(t); node == nil {
				return ""
			}
		}
		return node.LocalName()
	}
}
// namespaceFunc implements the XPath function namespace-uri([node-set]).
//
// Without an argument the current node is used; with one, the first node it
// selects is used, and "" is returned when it selects nothing. The result is
// the node's NamespaceURL when its navigator provides that method, and its
// prefix otherwise.
func namespaceFunc(arg query) func(query, iterator) interface{} {
	// fix about namespace-uri() bug: https://github.com/antchfx/xmlquery/issues/22
	// TODO: In the next version, add NamespaceURL() to the NodeNavigator interface.
	type namespaceURL interface {
		NamespaceURL() string
	}

	return func(_ query, t iterator) interface{} {
		var node NodeNavigator
		switch {
		case arg == nil:
			node = t.Current()
		default:
			// Only the first node of the node-set is considered.
			if node = arg.Clone().Select(t); node == nil {
				return ""
			}
		}
		if withURL, ok := node.(namespaceURL); ok {
			return withURL.NamespaceURL()
		}
		return node.Prefix()
	}
}
// asBool converts an evaluated XPath value to a boolean, following the rules
// of the XPath boolean() function:
//
//   - nil is false;
//   - a boolean is returned unchanged;
//   - a number is true unless it is zero or NaN;
//   - a string is true unless it is empty;
//   - a *NodeIterator is true when it can advance to a node;
//   - a node-set (query) is true when it selects at least one node.
//
// It panics on a value of any other type.
func asBool(t iterator, v interface{}) bool {
	switch v := v.(type) {
	case nil:
		return false
	case *NodeIterator:
		return v.MoveNext()
	case bool:
		return v
	case float64:
		// NaN compares unequal to 0 but must still convert to false.
		return v != 0 && !math.IsNaN(v)
	case string:
		return v != ""
	case query:
		return v.Select(t) != nil
	default:
		panic(fmt.Errorf("unexpected type: %T", v))
	}
}
// asString converts an evaluated XPath value to a string: nil is "", booleans
// are "true"/"false", numbers use the shortest 'g' formatting, strings are
// returned as is and a node-set (query) yields the value of its first selected
// node ("" when it selects nothing). It panics on a value of any other type.
func asString(t iterator, v interface{}) string {
	switch val := v.(type) {
	case string:
		return val
	case float64:
		return strconv.FormatFloat(val, 'g', -1, 64)
	case bool:
		return strconv.FormatBool(val)
	case nil:
		return ""
	case query:
		if node := val.Select(t); node != nil {
			return node.Value()
		}
		return ""
	default:
		panic(fmt.Errorf("unexpected type: %T", v))
	}
}
// booleanFunc implements the XPath function boolean(object): the argument is
// evaluated and converted with asBool.
func booleanFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		return asBool(t, functionArgs(arg1).Evaluate(t))
	}
}
// numberFunc implements the XPath function number([object]).
//
// With an argument, it is evaluated and converted with asNumber. Without one
// (arg1 is nil) the value of the current node is converted instead; previously
// that case called Clone on a nil query and panicked.
func numberFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		if arg1 == nil {
			return asNumber(t, t.Current().Value())
		}
		v := functionArgs(arg1).Evaluate(t)
		return asNumber(t, v)
	}
}
// stringFunc implements the XPath function string([object]).
//
// With an argument, it is evaluated and converted with asString. Without one
// (arg1 is nil) the value of the current node is returned; previously that
// case called Clone on a nil query and panicked.
func stringFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		if arg1 == nil {
			return t.Current().Value()
		}
		v := functionArgs(arg1).Evaluate(t)
		return asString(t, v)
	}
}
// startwithFunc implements the XPath function starts-with(string, string).
//
// The first argument may be a string or a node-set, in which case the value of
// its first node is used and an empty node-set yields false. The second
// argument must evaluate to a string. Any other argument type panics.
func startwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var subject string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case query:
			node := v.Select(t)
			if node == nil {
				return false
			}
			subject = node.Value()
		case string:
			subject = v
		default:
			panic(errors.New("starts-with() function argument type must be string"))
		}

		prefix, isString := functionArgs(arg2).Evaluate(t).(string)
		if !isString {
			panic(errors.New("starts-with() function argument type must be string"))
		}
		return strings.HasPrefix(subject, prefix)
	}
}
// endwithFunc implements the XPath function ends-with(string, string).
//
// The first argument may be a string or a node-set, in which case the value of
// its first node is used and an empty node-set yields false. The second
// argument must evaluate to a string. Any other argument type panics.
func endwithFunc(arg1, arg2 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var subject string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case query:
			node := v.Select(t)
			if node == nil {
				return false
			}
			subject = node.Value()
		case string:
			subject = v
		default:
			panic(errors.New("ends-with() function argument type must be string"))
		}

		suffix, isString := functionArgs(arg2).Evaluate(t).(string)
		if !isString {
			panic(errors.New("ends-with() function argument type must be string"))
		}
		return strings.HasSuffix(subject, suffix)
	}
}
// containsFunc implements the XPath function contains(string or @attr, string).
//
// The first argument may be a string or a node-set, in which case the value of
// its first node is used and an empty node-set yields false. The second
// argument must evaluate to a string. Any other argument type panics.
func containsFunc(arg1, arg2 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var haystack string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case query:
			node := v.Select(t)
			if node == nil {
				return false
			}
			haystack = node.Value()
		case string:
			haystack = v
		default:
			panic(errors.New("contains() function argument type must be string"))
		}

		needle, isString := functionArgs(arg2).Evaluate(t).(string)
		if !isString {
			panic(errors.New("contains() function argument type must be string"))
		}
		return strings.Contains(haystack, needle)
	}
}
// matchesFunc is an XPath function that tests a given string against a regexp pattern.
// Note: does not support https://www.w3.org/TR/xpath-functions-31/#func-matches 3rd optional `flags` argument; if
// needed, directly put flags in the regexp pattern, such as `(?i)^pattern$` for `i` flag.
//
// The first argument may be a string or a node-set (the value of its first
// node is tested); any other value is tested as the empty string. An empty
// node-set yields the boolean false, so the function always returns a bool.
// It used to return the string "" in that case, which made not(matches(...))
// evaluate to false.
// The function panics when the second argument is not a string or is not a
// valid regular expression.
func matchesFunc(arg1, arg2 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var s string
		switch typ := functionArgs(arg1).Evaluate(t).(type) {
		case string:
			s = typ
		case query:
			node := typ.Select(t)
			if node == nil {
				return false
			}
			s = node.Value()
		}
		var pattern string
		var ok bool
		if pattern, ok = functionArgs(arg2).Evaluate(t).(string); !ok {
			panic(errors.New("matches() function second argument type must be string"))
		}
		// Compiled patterns come from the package cache.
		re, err := getRegexp(pattern)
		if err != nil {
			panic(fmt.Errorf("matches() function second argument is not a valid regexp pattern, err: %s", err.Error()))
		}
		return re.MatchString(s)
	}
}
// normalizespaceFunc implements the XPath function normalize-space(string?).
//
// Leading and trailing whitespace is removed and every inner run of whitespace
// is replaced by a single space. The argument may be a string or a node-set
// (the value of its first node is used; an empty node-set yields ""); any
// other value is treated as the empty string.
func normalizespaceFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var text string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case query:
			node := v.Select(t)
			if node == nil {
				return ""
			}
			text = node.Value()
		case string:
			text = v
		}

		sb := builderPool.Get().(stringBuilder)
		sb.Grow(len(text))
		runes := []rune(strings.TrimSpace(text))
		for i, r := range runes {
			if !unicode.IsSpace(r) {
				sb.WriteRune(r)
				continue
			}
			// Within a run of whitespace only the last rune emits a space.
			if i+1 < len(runes) && unicode.IsSpace(runes[i+1]) {
				continue
			}
			sb.WriteRune(' ')
		}
		normalized := sb.String()
		sb.Reset()
		builderPool.Put(sb)
		return normalized
	}
}
// substringFunc implements the XPath function substring(string, start [, length]).
//
// The first argument may be a string or a node-set (the value of its first
// node is used; an empty node-set yields ""). start is 1-based. arg3 is nil
// when no length was given, in which case the rest of the string is returned.
// An explicit length that is zero or negative selects nothing and yields "";
// it used to return the rest of the string.
//
// Offsets are byte offsets into the string. The function panics when start or
// length is not a number, when start is below 1, or when start and length
// reach past the end of the string.
func substringFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var m string
		switch typ := functionArgs(arg1).Evaluate(t).(type) {
		case string:
			m = typ
		case query:
			node := typ.Select(t)
			if node == nil {
				return ""
			}
			m = node.Value()
		}

		var start, length float64
		var ok bool

		if start, ok = functionArgs(arg2).Evaluate(t).(float64); !ok {
			panic(errors.New("substring() function first argument type must be int"))
		} else if start < 1 {
			panic(errors.New("substring() function first argument type must be >= 1"))
		}
		// Convert the 1-based XPath position to a 0-based offset.
		start--
		if arg3 != nil {
			if length, ok = functionArgs(arg3).Evaluate(t).(float64); !ok {
				panic(errors.New("substring() function second argument type must be int"))
			}
		}
		if (len(m) - int(start)) < int(length) {
			panic(errors.New("substring() function start and length argument out of range"))
		}
		if arg3 != nil && length <= 0 {
			// An explicit empty (or negative) length selects no characters.
			return ""
		}
		if length > 0 {
			return m[int(start):int(length+start)]
		}
		return m[int(start):]
	}
}
// substringIndFunc implements the XPath functions substring-before(string,
// string) and substring-after(string, string); after selects which one.
//
// Each argument may be a string or a node-set (the value of its first node is
// used); any other value is treated as the empty string. An empty first
// node-set yields "". The result is the part of the first string before, or
// after, the first occurrence of the second string, and "" when it does not
// occur.
//
// An empty second string occurs at the very start of every string, so
// substring-before returns "" and substring-after returns the whole first
// string. substring-after used to return "" in that case.
func substringIndFunc(arg1, arg2 query, after bool) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var str string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case string:
			str = v
		case query:
			node := v.Select(t)
			if node == nil {
				return ""
			}
			str = node.Value()
		}

		// An empty node-set as the second argument has the string value "".
		var word string
		switch v := functionArgs(arg2).Evaluate(t).(type) {
		case string:
			word = v
		case query:
			if node := v.Select(t); node != nil {
				word = node.Value()
			}
		}

		i := strings.Index(str, word)
		if i < 0 {
			return ""
		}
		if after {
			return str[i+len(word):]
		}
		return str[:i]
	}
}
// stringLengthFunc implements the XPath function string-length(string): it
// returns the number of characters in the given string.
//
// The length is counted in Unicode code points rather than bytes, so
// non-ASCII text is measured correctly. The argument may be a string or a
// node-set (the value of its first node is measured); an empty node-set or any
// other value has length 0.
func stringLengthFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case string:
			return float64(len([]rune(v)))
		case query:
			node := v.Select(t)
			if node == nil {
				break
			}
			return float64(len([]rune(node.Value())))
		}
		return float64(0)
	}
}
// translateFunc implements the XPath function translate(string, from, to).
//
// Every character of the first string that occurs in from is replaced by the
// character at the same position in to, or removed when to has no character
// at that position. Positions are counted in characters (runes), not bytes,
// so multi-byte characters in from and to line up correctly. When a character
// occurs more than once in from, its first occurrence decides.
func translateFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		str := asString(t, functionArgs(arg1).Evaluate(t))
		src := []rune(asString(t, functionArgs(arg2).Evaluate(t)))
		dst := []rune(asString(t, functionArgs(arg3).Evaluate(t)))

		// The replacer takes old/new pairs, hence two entries per character.
		replace := make([]string, 0, 2*len(src))
		for i, s := range src {
			d := ""
			if i < len(dst) {
				d = string(dst[i])
			}
			replace = append(replace, string(s), d)
		}

		return strings.NewReplacer(replace...).Replace(str)
	}
}
// replaceFunc implements the replace() function: every occurrence of the
// second argument inside the first argument is substituted by the third
// argument. All three arguments are converted with asString and the match is
// a plain substring match (strings.Replace with no limit).
func replaceFunc(arg1, arg2, arg3 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		input := asString(t, functionArgs(arg1).Evaluate(t))
		oldText := asString(t, functionArgs(arg2).Evaluate(t))
		newText := asString(t, functionArgs(arg3).Evaluate(t))

		const noLimit = -1
		return strings.Replace(input, oldText, newText, noLimit)
	}
}
// notFunc implements the XPath not(expression) function, i.e. the negation of
// the boolean value of its argument:
//
//   - bool: the negated value;
//   - node-set (query): true when the set selects no node;
//   - number: true for zero and NaN;
//   - string: true for the empty string.
//
// Any other result type yields false.
func notFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case bool:
			return !v
		case query:
			node := v.Select(t)
			return node == nil
		case float64:
			// v != v holds only for NaN.
			return v == 0 || v != v
		case string:
			return v == ""
		default:
			return false
		}
	}
}
// concatFunc implements concat(string1, string2 [, stringN]*).
//
// Each argument is evaluated in order. String results are appended as-is, and
// for node-set results the value of the first selected node is appended
// (nothing when the set is empty). Results of any other type are skipped.
// The scratch builder is taken from builderPool and handed back, reset,
// before returning.
func concatFunc(args ...query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		sb := builderPool.Get().(stringBuilder)
		for _, arg := range args {
			switch val := functionArgs(arg).Evaluate(t).(type) {
			case string:
				sb.WriteString(val)
			case query:
				if node := val.Select(t); node != nil {
					sb.WriteString(node.Value())
				}
			}
		}
		joined := sb.String()
		sb.Reset()
		builderPool.Put(sb)
		return joined
	}
}
// functionArgs returns the query to evaluate for a function argument: a
// *functionQuery is returned unchanged, every other query is cloned first.
// See https://github.com/antchfx/xpath/issues/43
func functionArgs(q query) query {
	switch q.(type) {
	case *functionQuery:
		return q
	default:
		return q.Clone()
	}
}
// reverseFunc drains q, keeping a copy of every selected node, and returns a
// function that hands those copies back one per call in reverse selection
// order. Once all nodes have been returned the function yields nil.
func reverseFunc(q query, t iterator) func() NodeNavigator {
	var collected []NodeNavigator
	for node := q.Select(t); node != nil; node = q.Select(t) {
		collected = append(collected, node.Copy())
	}

	remaining := len(collected)
	return func() NodeNavigator {
		if remaining <= 0 {
			return nil
		}
		remaining--
		return collected[remaining]
	}
}
// stringJoinFunc implements string-join(node-set, separator).
//
// arg1 is the separator; it may evaluate to a string or to a node-set query,
// in which case the value of the first selected node is used (empty when the
// set is empty). q is the input: when it evaluates to a string that string is
// returned unchanged, when it evaluates to a node-set the values of all nodes
// accepted by predicate(q) are joined with the separator.
func stringJoinFunc(q, arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		var separator string
		switch v := functionArgs(arg1).Evaluate(t).(type) {
		case string:
			separator = v
		case query:
			node := v.Select(t)
			if node != nil {
				separator = node.Value()
			}
		}

		// Work on a per-call value. The captured q must stay untouched so
		// that one evaluation cannot affect the next.
		input := functionArgs(q)
		test := predicate(input)
		var parts []string
		switch v := input.Evaluate(t).(type) {
		case string:
			return v
		case query:
			for node := v.Select(t); node != nil; node = v.Select(t) {
				if test(node) {
					parts = append(parts, node.Value())
				}
			}
		}
		return strings.Join(parts, separator)
	}
}
// lowerCaseFunc implements lower-case(): the argument is evaluated, converted
// with asString and returned lower-cased via strings.ToLower.
func lowerCaseFunc(arg1 query) func(query, iterator) interface{} {
	return func(_ query, t iterator) interface{} {
		text := asString(t, functionArgs(arg1).Evaluate(t))
		return strings.ToLower(text)
	}
}
+16
View File
@@ -0,0 +1,16 @@
// +build go1.10
package xpath
import (
"math"
"strings"
)
// round returns f rounded to the nearest integer using math.Round, which
// rounds halfway cases away from zero.
func round(f float64) int {
	nearest := math.Round(f)
	return int(nearest)
}
// newStringBuilder returns a new, empty *strings.Builder as a stringBuilder.
// This is the variant compiled when the go1.10 build tag is satisfied.
func newStringBuilder() stringBuilder {
return &strings.Builder{}
}
+22
View File
@@ -0,0 +1,22 @@
// +build !go1.10
package xpath
import (
"bytes"
"math"
)
// round returns f rounded to the nearest integer, halfway cases away from
// zero.
//
// math.Round is only available from Go 1.10 on; this hand-written variant is
// the fallback for older toolchains.
// https://github.com/golang/go/issues/20100
func round(f float64) int {
	magnitude := math.Abs(f)
	if magnitude < 0.5 {
		return 0
	}
	// Shift by half a unit in the direction of f, then truncate toward zero.
	shifted := f + math.Copysign(0.5, f)
	return int(shifted)
}
// newStringBuilder returns a new, empty *bytes.Buffer as a stringBuilder.
// This is the variant compiled when the go1.10 build tag is NOT satisfied.
func newStringBuilder() stringBuilder {
return &bytes.Buffer{}
}
+288
View File
@@ -0,0 +1,288 @@
package xpath
import (
	"math"
	"strconv"
)
// logical is the signature shared by all typed comparison helpers: it receives
// the current iterator, the operator as a string (for example "=" or "or")
// and the two already evaluated operands, and reports whether the comparison
// holds.
type logical func(iterator, string, interface{}, interface{}) bool
// logicalFuncs is the dispatch table of the comparison operators. It is
// indexed as logicalFuncs[getXPathType(left)][getXPathType(right)]; going by
// the helper names, rows and columns are ordered boolean, numeric, string,
// node-set. A nil entry marks a combination that has no helper; since a nil
// func cannot be called, dispatching to such an entry panics.
var logicalFuncs = [][]logical{
{cmpBooleanBoolean, nil, nil, nil},
{nil, cmpNumericNumeric, cmpNumericString, cmpNumericNodeSet},
{nil, cmpStringNumeric, cmpStringString, cmpStringNodeSet},
{nil, cmpNodeSetNumeric, cmpNodeSetString, cmpNodeSetNodeSet},
}
// cmpNumberNumberF applies the relational operator op ("=", ">", "<", ">=",
// "<=", "!=") to the numbers a and b. Any other operator yields false.
func cmpNumberNumberF(op string, a, b float64) bool {
	result := false
	switch op {
	case "=":
		result = a == b
	case "!=":
		result = a != b
	case "<":
		result = a < b
	case "<=":
		result = a <= b
	case ">":
		result = a > b
	case ">=":
		result = a >= b
	}
	return result
}
// cmpStringStringF applies the relational operator op ("=", ">", "<", ">=",
// "<=", "!=") to the strings a and b using Go's string comparison.
// Any other operator yields false.
func cmpStringStringF(op string, a, b string) bool {
	switch {
	case op == "=":
		return a == b
	case op == "!=":
		return a != b
	case op == "<":
		return a < b
	case op == "<=":
		return a <= b
	case op == ">":
		return a > b
	case op == ">=":
		return a >= b
	default:
		return false
	}
}
// cmpBooleanBooleanF combines two booleans with the logical operator op:
// "or" yields a || b and "and" yields a && b. Any other operator yields false.
func cmpBooleanBooleanF(op string, a, b bool) bool {
	if op == "or" {
		return a || b
	}
	if op == "and" {
		return a && b
	}
	return false
}
// cmpNumericNumeric compares two numbers with op. Both m and n must hold a
// float64; the type assertions panic otherwise.
func cmpNumericNumeric(t iterator, op string, m, n interface{}) bool {
	return cmpNumberNumberF(op, m.(float64), n.(float64))
}
// cmpNumericString compares the number m with the string n using op. The
// string is parsed as a float64 first; a string that does not parse causes a
// panic carrying the strconv error.
func cmpNumericString(t iterator, op string, m, n interface{}) bool {
	left, text := m.(float64), n.(string)
	right, err := strconv.ParseFloat(text, 64)
	if err != nil {
		panic(err)
	}
	return cmpNumberNumberF(op, left, right)
}
// cmpNumericNodeSet compares the number m with every node selected from the
// node-set n and reports true as soon as one comparison `m op node` holds.
// Each node value is parsed as a float64; a value that does not parse causes a
// panic carrying the strconv error.
func cmpNumericNodeSet(t iterator, op string, m, n interface{}) bool {
	left, set := m.(float64), n.(query)
	for node := set.Select(t); node != nil; node = set.Select(t) {
		right, err := strconv.ParseFloat(node.Value(), 64)
		if err != nil {
			panic(err)
		}
		if cmpNumberNumberF(op, left, right) {
			return true
		}
	}
	return false
}
// cmpNodeSetNumeric compares every node selected from the node-set m with the
// number n and reports true as soon as one comparison `node op n` holds.
// Each node value is parsed as a float64; a value that does not parse causes a
// panic carrying the strconv error.
func cmpNodeSetNumeric(t iterator, op string, m, n interface{}) bool {
	set, right := m.(query), n.(float64)
	for node := set.Select(t); node != nil; node = set.Select(t) {
		left, err := strconv.ParseFloat(node.Value(), 64)
		if err != nil {
			panic(err)
		}
		if cmpNumberNumberF(op, left, right) {
			return true
		}
	}
	return false
}
// cmpNodeSetString compares every node selected from the node-set m with the
// string n and reports true as soon as one comparison `node op n` holds.
//
// m must hold a query and n a string; the type assertions panic otherwise.
func cmpNodeSetString(t iterator, op string, m, n interface{}) bool {
	a := m.(query)
	b := n.(string)
	for node := a.Select(t); node != nil; node = a.Select(t) {
		// The node-set is the left operand, so its value goes first.
		// The order matters for <, >, <= and >=.
		if cmpStringStringF(op, node.Value(), b) {
			return true
		}
	}
	return false
}
// cmpNodeSetNodeSet compares two node-sets with op.
//
// m and n must both hold a query (the type assertions panic otherwise). For
// each node x selected from m, the nodes of n are selected one after another
// and their string values are compared with cmpStringStringF. The function
// returns true on the first pair for which the comparison holds. It returns
// false as soon as m is exhausted, or when n yields no node at the start of a
// pass.
func cmpNodeSetNodeSet(t iterator, op string, m, n interface{}) bool {
a := m.(query)
b := n.(query)
for {
x := a.Select(t)
if x == nil {
return false
}
y := b.Select(t)
if y == nil {
return false
}
// Compare x against every remaining node of b.
for {
if cmpStringStringF(op, x.Value(), y.Value()) {
return true
}
if y = b.Select(t); y == nil {
break
}
}
// reset
// NOTE(review): Evaluate is called only for its side effect here, presumably
// to rewind b so the next x is compared against all of b again - confirm.
b.Evaluate(t)
}
}
// cmpStringNumeric compares the string m with the number n using op, i.e. it
// evaluates `number(m) op n`.
//
// m must hold a string and n a float64; the type assertions panic otherwise.
// The string is parsed as a float64 first; a string that does not parse causes
// a panic carrying the strconv error.
func cmpStringNumeric(t iterator, op string, m, n interface{}) bool {
	a := m.(string)
	b := n.(float64)
	num, err := strconv.ParseFloat(a, 64)
	if err != nil {
		panic(err)
	}
	// The string is the left operand, so its numeric value goes first.
	// The order matters for <, >, <= and >=.
	return cmpNumberNumberF(op, num, b)
}
// cmpStringString compares two strings with op. Both m and n must hold a
// string; the type assertions panic otherwise.
func cmpStringString(t iterator, op string, m, n interface{}) bool {
	return cmpStringStringF(op, m.(string), n.(string))
}
// cmpStringNodeSet compares the string m with every node selected from the
// node-set n and reports true as soon as one comparison `m op node` holds.
func cmpStringNodeSet(t iterator, op string, m, n interface{}) bool {
	left, set := m.(string), n.(query)
	for node := set.Select(t); node != nil; node = set.Select(t) {
		if cmpStringStringF(op, left, node.Value()) {
			return true
		}
	}
	return false
}
// cmpBooleanBoolean combines two booleans with op. Both m and n must hold a
// bool; the type assertions panic otherwise.
func cmpBooleanBoolean(t iterator, op string, m, n interface{}) bool {
	return cmpBooleanBooleanF(op, m.(bool), n.(bool))
}
// eqFunc implements the `=` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func eqFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, "=", m, n)
}
// gtFunc implements the `>` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func gtFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, ">", m, n)
}
// geFunc implements the `>=` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func geFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, ">=", m, n)
}
// ltFunc implements the `<` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func ltFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, "<", m, n)
}
// leFunc implements the `<=` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func leFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, "<=", m, n)
}
// neFunc implements the `!=` operator: the comparison helper matching the
// XPath types of m and n is looked up in logicalFuncs and applied.
func neFunc(t iterator, m, n interface{}) interface{} {
	compare := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return compare(t, "!=", m, n)
}
// orFunc implements the `or` operator: the helper matching the XPath types of
// m and n is looked up in logicalFuncs and applied with the "or" operator.
var orFunc = func(t iterator, m, n interface{}) interface{} {
	combine := logicalFuncs[getXPathType(m)][getXPathType(n)]
	return combine(t, "or", m, n)
}
// numericExpr converts both operands with asNumber (m first, then n) and
// returns the result of applying cb to the two numbers.
func numericExpr(t iterator, m, n interface{}, cb func(float64, float64) float64) float64 {
	left := asNumber(t, m)
	right := asNumber(t, n)
	result := cb(left, right)
	return result
}
// plusFunc implements the `+` operator on the numeric values of m and n.
var plusFunc = func(t iterator, m, n interface{}) interface{} {
	add := func(x, y float64) float64 { return x + y }
	return numericExpr(t, m, n, add)
}
// minusFunc implements the `-` operator on the numeric values of m and n.
var minusFunc = func(t iterator, m, n interface{}) interface{} {
	subtract := func(x, y float64) float64 { return x - y }
	return numericExpr(t, m, n, subtract)
}
// mulFunc implements the `*` operator on the numeric values of m and n.
var mulFunc = func(t iterator, m, n interface{}) interface{} {
	multiply := func(x, y float64) float64 { return x * y }
	return numericExpr(t, m, n, multiply)
}
// divFunc implements the `div` operator: floating-point division of the
// numeric value of m by the numeric value of n.
var divFunc = func(t iterator, m, n interface{}) interface{} {
	divide := func(x, y float64) float64 { return x / y }
	return numericExpr(t, m, n, divide)
}
// modFunc implements the `mod` operator.
//
// Both operands are converted to numbers by numericExpr and the
// floating-point remainder of the truncating division m / n is returned. The
// result takes the sign of the dividend (5 mod -2 = 1, -5 mod 2 = -1) and
// fractional operands are honoured (5.5 mod 2 = 1.5). A zero divisor yields
// NaN instead of an integer divide-by-zero panic.
var modFunc = func(t iterator, m, n interface{}) interface{} {
	return numericExpr(t, m, n, math.Mod)
}
+1254
View File
File diff suppressed because it is too large Load Diff
+1437
View File
File diff suppressed because it is too large Load Diff
+176
View File
@@ -0,0 +1,176 @@
package xpath
import (
"errors"
"fmt"
)
// NodeType represents a type of XPath node.
// The constants below are numbered consecutively with iota, starting at
// RootNode = 0.
type NodeType int
const (
// RootNode is a root node of the XML document or node tree.
RootNode NodeType = iota
// ElementNode is an element, such as <element>.
ElementNode
// AttributeNode is an attribute, such as id='123'.
AttributeNode
// TextNode is the text content of a node.
TextNode
// CommentNode is a comment node, such as <!-- my comment -->
CommentNode
// allNode stands for a node of any type. It is unexported and used by the
// xpath package only, for predicate matching.
allNode
)
// NodeNavigator provides a cursor model for navigating XML data.
//
// A navigator stands for one position in a node tree; the Move* methods
// reposition it in place, and Copy produces an independent navigator.
// NOTE(review): the bool results presumably report whether the move was
// performed (NodeIterator.MoveNext treats a false MoveTo as "not moved") -
// confirm against an implementation.
type NodeNavigator interface {
// NodeType returns the XPathNodeType of the current node.
NodeType() NodeType
// LocalName gets the Name of the current node.
LocalName() string
// Prefix returns namespace prefix associated with the current node.
Prefix() string
// Value gets the value of current node.
Value() string
// Copy does a deep copy of the NodeNavigator and all its components.
Copy() NodeNavigator
// MoveToRoot moves the NodeNavigator to the root node of the current node.
MoveToRoot()
// MoveToParent moves the NodeNavigator to the parent node of the current node.
MoveToParent() bool
// MoveToNextAttribute moves the NodeNavigator to the next attribute on current node.
MoveToNextAttribute() bool
// MoveToChild moves the NodeNavigator to the first child node of the current node.
MoveToChild() bool
// MoveToFirst moves the NodeNavigator to the first sibling node of the current node.
MoveToFirst() bool
// MoveToNext moves the NodeNavigator to the next sibling node of the current node.
MoveToNext() bool
// MoveToPrevious moves the NodeNavigator to the previous sibling node of the current node.
MoveToPrevious() bool
// MoveTo moves the NodeNavigator to the same position as the specified NodeNavigator.
MoveTo(NodeNavigator) bool
}
// NodeIterator iterates over the nodes matched by a compiled query.
// Call MoveNext to advance and Current to read the matched node.
type NodeIterator struct {
// node is the navigator returned by Current; MoveNext repositions or
// replaces it on every successful step.
node NodeNavigator
// query produces the matches; MoveNext calls its Select method.
query query
}
// Current returns the navigator held by the iterator. After a successful
// MoveNext it is positioned on the matched node; before the first MoveNext it
// is the navigator the iterator was created with.
func (t *NodeIterator) Current() NodeNavigator {
return t.node
}
// MoveNext advances the iterator to the next matching node and reports
// whether there was one. The current navigator is moved onto the match when
// it supports that; otherwise it is replaced by a copy of the match.
func (t *NodeIterator) MoveNext() bool {
	match := t.query.Select(t)
	if match == nil {
		return false
	}
	if moved := t.node.MoveTo(match); !moved {
		t.node = match.Copy()
	}
	return true
}
// Select selects a node set using the specified XPath expression.
// It panics if expr cannot be compiled.
//
// Deprecated: compile the expression with Compile and use Expr.Select instead.
func Select(root NodeNavigator, expr string) *NodeIterator {
	compiled, err := Compile(expr)
	if err == nil {
		return compiled.Select(root)
	}
	panic(err)
}
// Expr is a compiled XPath expression, created by Compile, CompileWithNS or
// MustCompile.
type Expr struct {
// s is the source text of the expression, returned by String.
s string
// q is the query built from s; Select and Evaluate run clones of it.
q query
}
// iteratorFunc adapts a plain function to an object with a Current method, so
// that a fixed navigator can be passed where query evaluation expects an
// iterator (see Expr.Evaluate).
type iteratorFunc func() NodeNavigator
// Current calls f and returns its result.
func (f iteratorFunc) Current() NodeNavigator {
return f()
}
// Evaluate returns the result of the expression evaluated against root.
// The result type is one of bool, float64, string or *NodeIterator; the
// latter is returned when the expression produces a node set.
func (expr *Expr) Evaluate(root NodeNavigator) interface{} {
	current := iteratorFunc(func() NodeNavigator { return root })
	result := expr.q.Evaluate(current)
	if _, isNodeSet := result.(query); isNodeSet {
		return &NodeIterator{query: expr.q.Clone(), node: root}
	}
	return result
}
// Select returns a NodeIterator over the nodes matched by the expression,
// starting from root. The iterator runs on a clone of the compiled query, so
// the Expr itself is not modified by iterating.
func (expr *Expr) Select(root NodeNavigator) *NodeIterator {
return &NodeIterator{query: expr.q.Clone(), node: root}
}
// String returns the XPath expression string the Expr was compiled from.
func (expr *Expr) String() string {
return expr.s
}
// Compile compiles an XPath expression string without namespace bindings.
//
// It returns an error when expr is empty, when the expression cannot be built,
// or when building produced no query (reported as an undeclared variable).
func Compile(expr string) (*Expr, error) {
	if expr == "" {
		return nil, errors.New("expr expression is nil")
	}
	qy, err := build(expr, nil)
	if err != nil {
		return nil, err
	}
	if qy == nil {
		// expr is passed as an argument, never as the format string, so a
		// '%' inside the expression is reported verbatim.
		return nil, fmt.Errorf("undeclared variable in XPath expression: %s", expr)
	}
	return &Expr{s: expr, q: qy}, nil
}
// MustCompile compiles an XPath expression string and ignores any compile
// error: when compilation fails it returns an Expr that keeps the expression
// text but is backed by nopQuery{} instead of a real query.
func MustCompile(expr string) *Expr {
	if compiled, err := Compile(expr); err == nil {
		return compiled
	}
	return &Expr{s: expr, q: nopQuery{}}
}
// CompileWithNS compiles an XPath expression string, passing the given
// namespaces map to the query builder.
//
// It returns an error when expr is empty, when the expression cannot be built,
// or when building produced no query (reported as an undeclared variable).
func CompileWithNS(expr string, namespaces map[string]string) (*Expr, error) {
	if expr == "" {
		return nil, errors.New("expr expression is nil")
	}
	qy, err := build(expr, namespaces)
	if err != nil {
		return nil, err
	}
	if qy == nil {
		// expr is passed as an argument, never as the format string, so a
		// '%' inside the expression is reported verbatim.
		return nil, fmt.Errorf("undeclared variable in XPath expression: %s", expr)
	}
	return &Expr{s: expr, q: qy}, nil
}