feat: Initial commit
This commit is contained in:
+218
@@ -0,0 +1,218 @@
|
||||
// Copyright 2018 Adam Tauber
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package colly
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"reflect"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Unmarshal is a shorthand for colly.UnmarshalHTML
|
||||
func (h *HTMLElement) Unmarshal(v interface{}) error {
|
||||
return UnmarshalHTML(v, h.DOM, nil)
|
||||
}
|
||||
|
||||
// UnmarshalWithMap is a shorthand for colly.UnmarshalHTML, extended to allow maps to be passed in.
|
||||
func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error {
|
||||
return UnmarshalHTML(v, h.DOM, structMap)
|
||||
}
|
||||
|
||||
// UnmarshalHTML declaratively extracts text or attributes to a struct from
|
||||
// HTML response using struct tags composed of css selectors.
|
||||
// Allowed struct tags:
|
||||
// - "selector" (required): CSS (goquery) selector of the desired data
|
||||
// - "attr" (optional): Selects the matching element's attribute's value.
|
||||
// Leave it blank or omit to get the text of the element.
|
||||
//
|
||||
// Example struct declaration:
|
||||
//
|
||||
// type Nested struct {
|
||||
// String string `selector:"div > p"`
|
||||
// Classes []string `selector:"li" attr:"class"`
|
||||
// Struct *Nested `selector:"div > div"`
|
||||
// }
|
||||
//
|
||||
// Supported types: struct, *struct, string, []string
|
||||
func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error {
|
||||
rv := reflect.ValueOf(v)
|
||||
|
||||
if rv.Kind() != reflect.Ptr || rv.IsNil() {
|
||||
return errors.New("Invalid type or nil-pointer")
|
||||
}
|
||||
|
||||
sv := rv.Elem()
|
||||
st := reflect.TypeOf(v).Elem()
|
||||
if structMap != nil {
|
||||
for k, v := range structMap {
|
||||
attrV := sv.FieldByName(k)
|
||||
if !attrV.CanAddr() || !attrV.CanSet() {
|
||||
continue
|
||||
}
|
||||
if err := unmarshalSelector(s, attrV, v); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for i := 0; i < sv.NumField(); i++ {
|
||||
attrV := sv.Field(i)
|
||||
if !attrV.CanAddr() || !attrV.CanSet() {
|
||||
continue
|
||||
}
|
||||
if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func unmarshalSelector(s *goquery.Selection, attrV reflect.Value, selector string) error {
|
||||
//selector is "-" specify that field should ignore.
|
||||
if selector == "-" {
|
||||
return nil
|
||||
}
|
||||
htmlAttr := ""
|
||||
// TODO support more types
|
||||
switch attrV.Kind() {
|
||||
case reflect.Slice:
|
||||
if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
case reflect.String:
|
||||
val := getDOMValue(s.Find(selector), htmlAttr)
|
||||
attrV.Set(reflect.Indirect(reflect.ValueOf(val)))
|
||||
case reflect.Struct:
|
||||
if err := unmarshalStruct(s, selector, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
case reflect.Ptr:
|
||||
if err := unmarshalPtr(s, selector, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
return errors.New("Invalid type: " + attrV.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error {
|
||||
selector := attrT.Tag.Get("selector")
|
||||
//selector is "-" specify that field should ignore.
|
||||
if selector == "-" {
|
||||
return nil
|
||||
}
|
||||
htmlAttr := attrT.Tag.Get("attr")
|
||||
// TODO support more types
|
||||
switch attrV.Kind() {
|
||||
case reflect.Slice:
|
||||
if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
case reflect.String:
|
||||
val := getDOMValue(s.Find(selector), htmlAttr)
|
||||
attrV.Set(reflect.Indirect(reflect.ValueOf(val)))
|
||||
case reflect.Struct:
|
||||
if err := unmarshalStruct(s, selector, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
case reflect.Ptr:
|
||||
if err := unmarshalPtr(s, selector, attrV); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
return errors.New("Invalid type: " + attrV.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error {
|
||||
newS := s
|
||||
if selector != "" {
|
||||
newS = newS.Find(selector)
|
||||
}
|
||||
if newS.Nodes == nil {
|
||||
return nil
|
||||
}
|
||||
v := reflect.New(attrV.Type())
|
||||
err := UnmarshalHTML(v.Interface(), newS, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
attrV.Set(reflect.Indirect(v))
|
||||
return nil
|
||||
}
|
||||
|
||||
func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error {
|
||||
newS := s
|
||||
if selector != "" {
|
||||
newS = newS.Find(selector)
|
||||
}
|
||||
if newS.Nodes == nil {
|
||||
return nil
|
||||
}
|
||||
e := attrV.Type().Elem()
|
||||
if e.Kind() != reflect.Struct {
|
||||
return errors.New("Invalid slice type")
|
||||
}
|
||||
v := reflect.New(e)
|
||||
err := UnmarshalHTML(v.Interface(), newS, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
attrV.Set(v)
|
||||
return nil
|
||||
}
|
||||
|
||||
func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error {
|
||||
if attrV.Pointer() == 0 {
|
||||
v := reflect.MakeSlice(attrV.Type(), 0, 0)
|
||||
attrV.Set(v)
|
||||
}
|
||||
switch attrV.Type().Elem().Kind() {
|
||||
case reflect.String:
|
||||
s.Find(selector).Each(func(_ int, s *goquery.Selection) {
|
||||
val := getDOMValue(s, htmlAttr)
|
||||
attrV.Set(reflect.Append(attrV, reflect.Indirect(reflect.ValueOf(val))))
|
||||
})
|
||||
case reflect.Ptr:
|
||||
s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) {
|
||||
someVal := reflect.New(attrV.Type().Elem().Elem())
|
||||
UnmarshalHTML(someVal.Interface(), innerSel, nil)
|
||||
attrV.Set(reflect.Append(attrV, someVal))
|
||||
})
|
||||
case reflect.Struct:
|
||||
s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) {
|
||||
someVal := reflect.New(attrV.Type().Elem())
|
||||
UnmarshalHTML(someVal.Interface(), innerSel, nil)
|
||||
attrV.Set(reflect.Append(attrV, reflect.Indirect(someVal)))
|
||||
})
|
||||
default:
|
||||
return errors.New("Invalid slice type")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getDOMValue(s *goquery.Selection, attr string) string {
|
||||
if attr == "" {
|
||||
return strings.TrimSpace(s.First().Text())
|
||||
}
|
||||
attrV, _ := s.Attr(attr)
|
||||
return attrV
|
||||
}
|
||||
Reference in New Issue
Block a user