feat: Initial commit

This commit is contained in:
2025-07-26 05:58:59 +00:00
commit 753d1c60ea
1849 changed files with 830533 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
comment: false
+43
View File
@@ -0,0 +1,43 @@
# 2.1.0 - 2020.06.09
- HTTP tracing support
- New callback: OnResponseHeader
- Queue fixes
- New collector option: Collector.CheckHead
- Proxy fixes
- Fixed POST revisit checking
- Updated dependencies
# 2.0.0 - 2019.11.28
- Breaking change: Change Collector.RedirectHandler member to Collector.SetRedirectHandler function
- Go module support
- Collector.HasVisited method added to be able to check if a URL has been visited
- Collector.SetClient method introduced
- HTMLElement.ChildTexts method added
- New user agents
- Multiple bugfixes
# 1.2.0 - 2019.02.13
- Compatibility with the latest htmlquery package
- New request shortcut for HEAD requests
- Check URL availability before visiting
- Fix proxy URL value
- Request counter fix
- Minor fixes in examples
# 1.1.0 - 2018.08.13
- Appengine integration takes context.Context instead of http.Request (API change)
- Added "Accept" http header by default to every request
- Support slices of pointers in unmarshal
- Fixed a race condition in queues
- ForEachWithBreak method added to HTMLElement
- Added a local file example
- Support gzip decompression of response bodies
- Don't share waitgroup when cloning a collector
- Fixed instagram example
# 1.0.0 - 2018.05.13
+67
View File
@@ -0,0 +1,67 @@
# Contribute
## Introduction
First, thank you for considering contributing to colly! It's people like you that make the open source community such a great community! 😊
We welcome any type of contribution, not only code. You can help with
- **QA**: file bug reports, the more details you can give the better (e.g. screenshots with the console open)
- **Marketing**: writing blog posts, how-tos, printing stickers, ...
- **Community**: presenting the project at meetups, organizing a dedicated meetup for the local community, ...
- **Code**: take a look at the [open issues](https://github.com/gocolly/colly/issues). Even if you can't write code, commenting on them, showing that you care about a given issue matters. It helps us triage them.
- **Money**: we welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly).
## Your First Contribution
Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://app.egghead.io/playlists/how-to-contribute-to-an-open-source-project-on-github).
## Submitting code
Any code change should be submitted as a pull request. The description should explain what the code does and give steps to execute it. The pull request should also contain tests.
## Code review process
The bigger the pull request, the longer it will take to review and merge. Try to break down large pull requests into smaller chunks that are easier to review and merge.
It is also always helpful to have some context for your pull request. What was the purpose? Why does it matter to you?
## Financial contributions
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly).
Anyone can file an expense. If the expense makes sense for the development of the community, it will be "merged" in the ledger of our open collective by the core contributors and the person who filed the expense will be reimbursed.
## Questions
If you have any questions, create an [issue](https://github.com/gocolly/colly/issues/new) (protip: do a quick search first to see whether someone else has already asked the same question!).
You can also reach us at hello@colly.opencollective.com.
## Credits
### Contributors
Thank you to all the people who have already contributed to colly!
<a href="graphs/contributors"><img src="https://opencollective.com/colly/contributors.svg?width=890" /></a>
### Backers
Thank you to all our backers! [[Become a backer](https://opencollective.com/colly#backer)]
<a href="https://opencollective.com/colly#backers" target="_blank"><img src="https://opencollective.com/colly/backers.svg?width=890"></a>
### Sponsors
Thank you to all our sponsors! (please ask your company to also support this open source project by [becoming a sponsor](https://opencollective.com/colly#sponsor))
<a href="https://opencollective.com/colly/sponsor/0/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/0/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/1/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/1/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/2/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/2/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/3/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/3/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/4/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/4/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/5/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/5/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/6/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/6/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/7/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/7/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/8/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/8/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/9/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/9/avatar.svg"></a>
<!-- This `CONTRIBUTING.md` is based on @nayafia's template https://github.com/nayafia/contributing-template -->
+202
View File
@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+142
View File
@@ -0,0 +1,142 @@
# Colly
Lightning Fast and Elegant Scraping Framework for Gophers
Colly provides a clean interface to write any kind of crawler/scraper/spider.
With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving.
[![GoDoc](https://godoc.org/github.com/gocolly/colly?status.svg)](https://pkg.go.dev/github.com/gocolly/colly/v2)
[![Backers on Open Collective](https://opencollective.com/colly/backers/badge.svg)](#backers) [![Sponsors on Open Collective](https://opencollective.com/colly/sponsors/badge.svg)](#sponsors) [![build status](https://github.com/gocolly/colly/actions/workflows/ci.yml/badge.svg)](https://github.com/gocolly/colly/actions/workflows/ci.yml)
[![report card](https://img.shields.io/badge/report%20card-a%2B-ff3333.svg?style=flat-square)](http://goreportcard.com/report/gocolly/colly)
[![view examples](https://img.shields.io/badge/learn%20by-examples-0077b3.svg?style=flat-square)](https://github.com/gocolly/colly/tree/master/_examples)
[![Code Coverage](https://img.shields.io/codecov/c/github/gocolly/colly/master.svg)](https://codecov.io/github/gocolly/colly?branch=master)
[![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2Fgocolly%2Fcolly.svg?type=shield)](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_shield)
[![Twitter URL](https://img.shields.io/badge/twitter-follow-green.svg)](https://twitter.com/gocolly)
------
## Sponsors
<a href="https://scrapfly.io/?utm_source=Github&utm_medium=repo&utm_campaign=colly" target="_blank"><img src="assets/scrapfly.png" alt="Scrapfly.io" width="149"></a>
[Scrapfly](https://scrapfly.io/?utm_source=Github&utm_medium=repo&utm_campaign=colly)
is an enterprise-grade solution providing Web Scraping API that aims to simplify the
scraping process by managing everything: real browser rendering, rotating proxies, and
fingerprints (TLS, HTTP, browser) to bypass all major anti-bots. Scrapfly also unlocks the
observability by providing an analytical dashboard and measuring the success rate/block
rate in detail.
------
## Features
- Clean API
- Fast (>1k requests/sec on a single core)
- Manages request delays and maximum concurrency per domain
- Automatic cookie and session handling
- Sync/async/parallel scraping
- Caching
- Automatic encoding of non-unicode responses
- Robots.txt support
- Distributed scraping
- Configuration via environment variables
- Extensions
## Example
```go
func main() {
c := colly.NewCollector()
// Find and visit all links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
c.Visit("http://go-colly.org/")
}
```
See [examples folder](https://github.com/gocolly/colly/tree/master/_examples) for more detailed examples.
## Installation
Add colly to your `go.mod` file:
```
module github.com/x/y
go 1.14
require (
github.com/gocolly/colly/v2 latest
)
```
## Bugs
Bugs or suggestions? Visit the [issue tracker](https://github.com/gocolly/colly/issues) or join `#colly` on freenode
## Other Projects Using Colly
Below is a list of public, open source projects that use Colly:
- [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive.
- [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go.
- [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scraper, making your day a little better!
- [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you.
- [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects.
- [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line.
- [yringler/inside-chassidus-scraper](https://github.com/yringler/inside-chassidus-scraper) Scrapes Rabbi Paltiel's web site for lesson metadata.
- [gamedb/gamedb](https://github.com/gamedb/gamedb) A database of Steam games.
- [lawzava/scrape](https://github.com/lawzava/scrape) CLI for email scraping from any website.
- [eureka101v/WeiboSpiderGo](https://github.com/eureka101v/WeiboSpiderGo) A Sina Weibo (Chinese Twitter) scraper.
- [Go-phie/gophie](https://github.com/Go-phie/gophie) Search, Download and Stream movies from your terminal
- [imthaghost/goclone](https://github.com/imthaghost/goclone) Clone websites to your computer within seconds.
- [superiss/spidy](https://github.com/superiss/spidy) Crawl the web and collect expired domains.
- [docker-slim/docker-slim](https://github.com/docker-slim/docker-slim) Optimize your Docker containers to make them smaller and better.
- [seversky/gachifinder](https://github.com/seversky/gachifinder) An agent for asynchronous scraping, parsing and writing to some storages (Elasticsearch for now).
- [eval-exec/goodreads](https://github.com/eval-exec/goodreads) crawl all tags and all pages of quotes from goodreads.
If you are using Colly in a project please send a pull request to add it to the list.
## Contributors
This project exists thanks to all the people who contribute. [[Contribute]](CONTRIBUTING.md).
<a href="https://github.com/gocolly/colly/graphs/contributors"><img src="https://opencollective.com/colly/contributors.svg?width=890" /></a>
## Backers
Thank you to all our backers! 🙏 [[Become a backer](https://opencollective.com/colly#backer)]
<a href="https://opencollective.com/colly#backers" target="_blank"><img src="https://opencollective.com/colly/backers.svg?width=890"></a>
## Sponsors
Support this project by becoming a sponsor. Your logo will show up here with a link to your website. [[Become a sponsor](https://opencollective.com/colly#sponsor)]
<a href="https://opencollective.com/colly/sponsor/0/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/0/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/1/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/1/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/2/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/2/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/3/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/3/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/4/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/4/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/5/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/5/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/6/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/6/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/7/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/7/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/8/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/8/avatar.svg"></a>
<a href="https://opencollective.com/colly/sponsor/9/website" target="_blank"><img src="https://opencollective.com/colly/sponsor/9/avatar.svg"></a>
## License
[![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2Fgocolly%2Fcolly.svg?type=large)](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_large)
+1
View File
@@ -0,0 +1 @@
2.1.0
+1583
View File
File diff suppressed because it is too large Load Diff
+87
View File
@@ -0,0 +1,87 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"sync"
)
// Context provides a tiny layer for passing data between callbacks.
// Every access to the underlying map is guarded by a read/write mutex.
type Context struct {
	contextMap map[string]interface{}
	lock       *sync.RWMutex
}

// NewContext initializes a new, empty Context instance.
func NewContext() *Context {
	return &Context{
		contextMap: make(map[string]interface{}),
		lock:       &sync.RWMutex{},
	}
}

// UnmarshalBinary decodes Context value to nil.
// This function is used by request caching. The input is ignored and the
// Context is left untouched.
func (c *Context) UnmarshalBinary(_ []byte) error {
	return nil
}

// MarshalBinary encodes Context value.
// This function is used by request caching. It always returns a nil slice
// and a nil error, i.e. the stored values are not serialized.
func (c *Context) MarshalBinary() (_ []byte, _ error) {
	return nil, nil
}

// Put stores a value of any type in Context, replacing any value previously
// stored under the same key.
func (c *Context) Put(key string, value interface{}) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.contextMap[key] = value
}

// Get retrieves a string value from Context.
// Get returns an empty string if the key is not found or if the stored value
// is not a string; use GetAny to read values of other types.
func (c *Context) Get(key string) string {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// A missing key yields a nil interface, which also fails the checked
	// assertion, so both "absent" and "wrong type" fall through to "".
	if s, ok := c.contextMap[key].(string); ok {
		return s
	}
	return ""
}

// GetAny retrieves a value from Context.
// GetAny returns nil if key not found.
func (c *Context) GetAny(key string) interface{} {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// Indexing a map with a missing key yields the zero value, which for
	// interface{} is nil.
	return c.contextMap[key]
}

// ForEach calls fn once for every key/value pair stored in the Context and
// returns the collected results. The iteration order follows Go's map
// iteration and is therefore unspecified.
//
// fn runs while the Context's read lock is held, so it must not call Put on
// the same Context: Put needs the write lock and would block forever.
func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{} {
	c.lock.RLock()
	defer c.lock.RUnlock()

	ret := make([]interface{}, 0, len(c.contextMap))
	for k, v := range c.contextMap {
		ret = append(ret, fn(k, v))
	}
	return ret
}
+36
View File
@@ -0,0 +1,36 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package debug
// Event represents an action inside a collector.
// Events are delivered to a Debugger backend through its Event method.
type Event struct {
// Type is the type of the event. It is a free-form string; the WebDebugger
// in this package, for example, reacts to "request", "response" and "error".
Type string
// RequestID identifies the HTTP request of the Event
RequestID uint32
// CollectorID identifies the collector of the Event
CollectorID uint32
// Values contains the event's key-value pairs. Different types of events
// can carry different key-value pairs (the WebDebugger reads the "url" and
// "status" keys).
Values map[string]string
}
// Debugger is an interface for different types of debugging backends.
// A backend is set up through Init and is then handed collector events
// through Event.
type Debugger interface {
// Init initializes the backend. A non-nil error reports that the backend
// could not be set up.
Init() error
// Event receives a new collector event.
Event(e *Event)
}
+54
View File
@@ -0,0 +1,54 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package debug
import (
"io"
"log"
"os"
"sync/atomic"
"time"
)
// LogDebugger is the simplest debugger: it writes one log line per collector
// event, to STDERR unless another destination is configured.
type LogDebugger struct {
	// Output is the log destination; anything that implements the io.Writer
	// interface can be used. Leave it nil to use STDERR.
	Output io.Writer
	// Prefix appears at the beginning of each generated log line
	Prefix string
	// Flag defines the logging properties; it is passed to log.New unchanged.
	Flag    int
	logger  *log.Logger
	counter int32
	start   time.Time
}

// Init prepares the LogDebugger for use: it falls back to STDERR when no
// Output is set, builds the underlying logger, and resets the event counter
// and the reference time used for the elapsed-time column. It always
// returns nil.
func (l *LogDebugger) Init() error {
	if l.Output == nil {
		l.Output = os.Stderr
	}
	l.logger = log.New(l.Output, l.Prefix, l.Flag)
	l.counter = 0
	l.start = time.Now()
	return nil
}

// Event receives Collector events and prints them to the configured output.
// Each line carries a running event number, the collector ID, the request
// ID, the event type, the event values and the time elapsed since Init.
func (l *LogDebugger) Event(e *Event) {
	// The counter is bumped atomically so concurrent events get distinct numbers.
	seq := atomic.AddInt32(&l.counter, 1)
	elapsed := time.Since(l.start)
	l.logger.Printf("[%06d] %d [%6d - %s] %q (%s)\n", seq, e.CollectorID, e.RequestID, e.Type, e.Values, elapsed)
}
+153
View File
@@ -0,0 +1,153 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package debug
import (
"encoding/json"
"log"
"net/http"
"sync"
"time"
)
// WebDebugger is a web based debuging frontend for colly
type WebDebugger struct {
// Address is the address of the web server. It is 127.0.0.1:7676 by default.
Address string
initialized bool
CurrentRequests map[uint32]requestInfo
RequestLog []requestInfo
sync.Mutex
}
type requestInfo struct {
URL string
Started time.Time
Duration time.Duration
ResponseStatus string
ID uint32
CollectorID uint32
}
// Init initializes the WebDebugger
func (w *WebDebugger) Init() error {
if w.initialized {
return nil
}
defer func() {
w.initialized = true
}()
if w.Address == "" {
w.Address = "127.0.0.1:7676"
}
w.RequestLog = make([]requestInfo, 0)
w.CurrentRequests = make(map[uint32]requestInfo)
http.HandleFunc("/", w.indexHandler)
http.HandleFunc("/status", w.statusHandler)
log.Println("Starting debug webserver on", w.Address)
go http.ListenAndServe(w.Address, nil)
return nil
}
// Event updates the debugger's status
func (w *WebDebugger) Event(e *Event) {
w.Lock()
defer w.Unlock()
switch e.Type {
case "request":
w.CurrentRequests[e.RequestID] = requestInfo{
URL: e.Values["url"],
Started: time.Now(),
ID: e.RequestID,
CollectorID: e.CollectorID,
}
case "response", "error":
r := w.CurrentRequests[e.RequestID]
r.Duration = time.Since(r.Started)
r.ResponseStatus = e.Values["status"]
w.RequestLog = append(w.RequestLog, r)
delete(w.CurrentRequests, e.RequestID)
}
}
// indexHandler serves the debugger's single-page web UI. The page is a static
// HTML document; its embedded script fetches "/status" as JSON, renders the
// current and finished requests, and schedules the next fetch one second
// after each response arrives.
func (w *WebDebugger) indexHandler(wr http.ResponseWriter, r *http.Request) {
wr.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Colly Debugger WebUI</title>
<script src="https://code.jquery.com/jquery-latest.min.js" type="text/javascript"></script>
<link rel="stylesheet" type="text/css" href="https://semantic-ui.com/dist/semantic.min.css">
</head>
<body>
<div class="ui inverted vertical masthead center aligned segment" id="menu">
<div class="ui tiny secondary inverted menu">
<a class="item" href="/"><b>Colly WebDebugger</b></a>
</div>
</div>
<div class="ui grid container">
<div class="row">
<div class="eight wide column">
<h1>Current Requests <span id="current_request_count"></span></h1>
<div id="current_requests" class="ui small feed"></div>
</div>
<div class="eight wide column">
<h1>Finished Requests <span id="request_log_count"></span></h1>
<div id="request_log" class="ui small feed"></div>
</div>
</div>
</div>
<script>
function curRequestTpl(url, started, collectorId) {
return '<div class="event"><div class="content"><div class="summary">' + url + '</div><div class="meta">Collector #' + collectorId + ' - ' + started + "</div></div></div>";
}
function requestLogTpl(url, duration, collectorId) {
return '<div class="event"><div class="content"><div class="summary">' + url + '</div><div class="meta">Collector #' + collectorId + ' - ' + (duration/1000000000) + "s</div></div></div>";
}
function fetchStatus() {
$.getJSON("/status", function(data) {
$("#current_requests").html("");
$("#request_log").html("");
$("#current_request_count").text('(' + Object.keys(data.CurrentRequests).length + ')');
$("#request_log_count").text('(' + data.RequestLog.length + ')');
for(var i in data.CurrentRequests) {
var r = data.CurrentRequests[i];
$("#current_requests").append(curRequestTpl(r.URL, r.Started, r.CollectorID));
}
for(var i in data.RequestLog.reverse()) {
var r = data.RequestLog[i];
$("#request_log").append(requestLogTpl(r.URL, r.Duration, r.CollectorID));
}
setTimeout(fetchStatus, 1000);
});
}
$(document).ready(function() {
fetchStatus();
});
</script>
</body>
</html>
`))
}
// statusHandler serves the debugger state as indented JSON. The debugger is
// locked only while it is being marshalled, so event processing is not
// blocked while the response is written. A marshalling failure is reported
// to the client as a 500 response instead of panicking inside the handler.
func (w *WebDebugger) statusHandler(wr http.ResponseWriter, r *http.Request) {
	w.Lock()
	jsonData, err := json.MarshalIndent(w, "", " ")
	w.Unlock()
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
	wr.Header().Set("Content-Type", "application/json")
	wr.Write(jsonData)
}
+131
View File
@@ -0,0 +1,131 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"strings"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// HTMLElement is the representation of a HTML tag.
type HTMLElement struct {
// Name is the name of the tag
Name string
// Text is the text content of the element;
// NewHTMLElementFromSelectionNode fills it from the element's node.
Text string
// attributes holds the tag's attributes; use Attr to look one up by key.
attributes []html.Attribute
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the goquery parsed DOM object of the page. DOM is relative
// to the current HTMLElement
DOM *goquery.Selection
// Index stores the position of the current element within all the elements matched by an OnHTML callback
Index int
}
// NewHTMLElementFromSelectionNode builds an HTMLElement for node n of the
// goquery selection s. The element's name and attributes come from n, its
// text is the text content of n, and its request is taken from resp.
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement {
	el := new(HTMLElement)
	el.Name = n.Data
	el.Request = resp.Request
	el.Response = resp
	el.Text = goquery.NewDocumentFromNode(n).Text()
	el.DOM = s
	el.Index = idx
	el.attributes = n.Attr
	return el
}
// Attr returns the value of the element's attribute named k, or the empty
// string when the element has no such attribute.
func (h *HTMLElement) Attr(k string) string {
	for i := range h.attributes {
		if attr := h.attributes[i]; attr.Key == k {
			return attr.Val
		}
	}
	return ""
}
// ChildText returns the combined text content of every descendant matching
// goquerySelector, with leading and trailing whitespace removed.
func (h *HTMLElement) ChildText(goquerySelector string) string {
	matched := h.DOM.Find(goquerySelector)
	return strings.TrimSpace(matched.Text())
}
// ChildTexts returns the whitespace-trimmed text content of each descendant
// matching goquerySelector, one entry per match. The result is nil when
// nothing matches.
func (h *HTMLElement) ChildTexts(goquerySelector string) []string {
	var texts []string
	collect := func(_ int, match *goquery.Selection) {
		texts = append(texts, strings.TrimSpace(match.Text()))
	}
	h.DOM.Find(goquerySelector).Each(collect)
	return texts
}
// ChildAttr returns the whitespace-trimmed value of attribute attrName on the
// first descendant matching goquerySelector, or the empty string when the
// attribute is not found.
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
	value, found := h.DOM.Find(goquerySelector).Attr(attrName)
	if !found {
		return ""
	}
	return strings.TrimSpace(value)
}
// ChildAttrs returns the whitespace-trimmed value of attribute attrName for
// every descendant matching goquerySelector. Matches that lack the attribute
// are skipped.
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string {
	var values []string
	collect := func(_ int, match *goquery.Selection) {
		value, found := match.Attr(attrName)
		if !found {
			return
		}
		values = append(values, strings.TrimSpace(value))
	}
	h.DOM.Find(goquerySelector).Each(collect)
	return values
}
// ForEach calls callback once for every descendant element matching
// goquerySelector. The callback receives a running index, starting at zero,
// and an HTMLElement built for the matched node.
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) {
	idx := 0
	visit := func(_ int, sel *goquery.Selection) {
		for _, node := range sel.Nodes {
			callback(idx, NewHTMLElementFromSelectionNode(h.Response, sel, node, idx))
			idx++
		}
	}
	h.DOM.Find(goquerySelector).Each(visit)
}
// ForEachWithBreak iterates over the elements matched by the first argument
// and calls the callback function on every HTMLElement match.
// It is identical to ForEach except that it is possible to break
// out of the loop by returning false in the callback function.
// The callback receives a running index, starting at zero. ForEachWithBreak
// itself returns nothing.
func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) {
	i := 0
	h.DOM.Find(goquerySelector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
		// Visit every node of the selection; stop the whole iteration as
		// soon as the callback asks for it.
		for _, n := range s.Nodes {
			keepGoing := callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i))
			i++
			if !keepGoing {
				return false
			}
		}
		return true
	})
}
+239
View File
@@ -0,0 +1,239 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"crypto/sha1"
"encoding/gob"
"encoding/hex"
"io"
"math/rand"
"net/http"
"os"
"path"
"regexp"
"strings"
"sync"
"time"
"compress/gzip"
"github.com/gobwas/glob"
)
// httpBackend bundles the HTTP client used to send requests with the
// per-domain limit rules applied to them.
type httpBackend struct {
// LimitRules is the list of rules searched by GetMatchingRule; the first
// rule that matches a domain is used.
LimitRules []*LimitRule
// Client is the HTTP client Do uses to send requests.
Client *http.Client
// lock is taken by Limit and GetMatchingRule around access to LimitRules.
lock *sync.RWMutex
}
// checkHeadersFunc is called with the request, the response status code and
// the response headers before the body is read. In Do, a false return aborts
// the download with ErrAbortedAfterHeaders.
type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool
// LimitRule provides connection restrictions for domains.
// Both DomainRegexp and DomainGlob can be used to specify
// the included domains patterns, but at least one is required.
// There can be two kind of limitations:
// - Parallelism: Set limit for the number of concurrent requests to matching domains
// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
type LimitRule struct {
// DomainRegexp is a regular expression to match against domains
DomainRegexp string
// DomainGlob is a glob pattern to match against domains
DomainGlob string
// Delay is the duration to wait before creating a new request to the matching domains
Delay time.Duration
// RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
RandomDelay time.Duration
// Parallelism is the number of the maximum allowed concurrent requests of the matching domains
Parallelism int
// waitChan is a buffered channel created by Init with capacity Parallelism
// (at least 1). A request sends on it before it starts and receives from
// it once it is done, so the capacity bounds the requests in flight.
waitChan chan bool
// compiledRegexp is DomainRegexp compiled by Init; nil when DomainRegexp is empty.
compiledRegexp *regexp.Regexp
// compiledGlob is DomainGlob compiled by Init; nil when DomainGlob is empty.
compiledGlob glob.Glob
}
// Init prepares the rule for use: it allocates the wait channel (capacity
// Parallelism, but never less than one) and compiles whichever of
// DomainRegexp and DomainGlob is set. It returns the compile error of an
// invalid pattern, or ErrNoPattern when neither pattern is set.
func (r *LimitRule) Init() error {
	slots := r.Parallelism
	if slots <= 1 {
		slots = 1
	}
	r.waitChan = make(chan bool, slots)

	patterns := 0
	if expr := r.DomainRegexp; expr != "" {
		re, err := regexp.Compile(expr)
		if err != nil {
			return err
		}
		r.compiledRegexp = re
		patterns++
	}
	if pattern := r.DomainGlob; pattern != "" {
		g, err := glob.Compile(pattern)
		if err != nil {
			return err
		}
		r.compiledGlob = g
		patterns++
	}
	if patterns == 0 {
		return ErrNoPattern
	}
	return nil
}
// Init seeds the math/rand generator, installs an HTTP client that uses jar
// as its cookie jar with a ten second timeout, and creates the lock used
// around the limit rules.
func (h *httpBackend) Init(jar http.CookieJar) {
	rand.Seed(time.Now().UnixNano())

	client := new(http.Client)
	client.Jar = jar
	client.Timeout = 10 * time.Second
	h.Client = client

	h.lock = new(sync.RWMutex)
}
// Match reports whether domain matches the rule's compiled regular
// expression or its compiled glob. A rule with no compiled pattern matches
// nothing.
func (r *LimitRule) Match(domain string) bool {
	if re := r.compiledRegexp; re != nil && re.MatchString(domain) {
		return true
	}
	if g := r.compiledGlob; g != nil && g.Match(domain) {
		return true
	}
	return false
}
// GetMatchingRule returns the first registered limit rule that matches
// domain, or nil when no rule matches. LimitRules is only read while the
// read lock is held, because Limit may append to it concurrently.
func (h *httpBackend) GetMatchingRule(domain string) *LimitRule {
	// The lock is created by Init; without it there is nothing to search
	// safely, so report "no rule".
	if h.lock == nil {
		return nil
	}
	h.lock.RLock()
	defer h.lock.RUnlock()
	// Ranging over a nil slice is a no-op, so no separate nil check is needed.
	for _, r := range h.LimitRules {
		if r.Match(domain) {
			return r
		}
	}
	return nil
}
// Cache performs request through an on-disk response cache rooted at
// cacheDir. The cache is bypassed (the request goes straight to Do) when
// cacheDir is empty, when the method is not GET, or when the request carries
// "Cache-Control: no-cache".
//
// Entries are gob-encoded Responses stored under
// cacheDir/<first two hex digits>/<sha1 of the URL>. A cached entry with a
// status below 500 is returned as is, together with any decode error;
// otherwise the request is sent and, unless it failed or returned a status
// of 500 or above, the response is written to a temporary file and renamed
// into place.
func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) {
	if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" {
		return h.Do(request, bodySize, checkHeadersFunc)
	}
	sum := sha1.Sum([]byte(request.URL.String()))
	hash := hex.EncodeToString(sum[:])
	dir := path.Join(cacheDir, hash[:2])
	filename := path.Join(dir, hash)
	if file, err := os.Open(filename); err == nil {
		resp := new(Response)
		err := gob.NewDecoder(file).Decode(resp)
		file.Close()
		// Headers can still be nil here (for example when decoding failed);
		// substitute an empty header set instead of dereferencing nil.
		if resp.Headers == nil {
			resp.Headers = &http.Header{}
		}
		checkHeadersFunc(request, resp.StatusCode, *resp.Headers)
		if resp.StatusCode < 500 {
			return resp, err
		}
	}
	resp, err := h.Do(request, bodySize, checkHeadersFunc)
	if err != nil || resp.StatusCode >= 500 {
		return resp, err
	}
	// MkdirAll succeeds when the directory already exists.
	if err := os.MkdirAll(dir, 0750); err != nil {
		return resp, err
	}
	// Write to a temporary name first so a partially written entry is never
	// visible under the final file name.
	tmpName := filename + "~"
	file, err := os.Create(tmpName)
	if err != nil {
		return resp, err
	}
	if err := gob.NewEncoder(file).Encode(resp); err != nil {
		file.Close()
		os.Remove(tmpName) // best effort: do not leave a broken temp file behind
		return resp, err
	}
	if err := file.Close(); err != nil {
		os.Remove(tmpName) // best effort, as above
		return resp, err
	}
	return resp, os.Rename(tmpName, filename)
}
// Do sends request and returns the response with its body fully read.
//
// When a limit rule matches the request host, Do takes a slot in the rule's
// wait channel before sending and releases it only after sleeping for Delay
// plus a random share of RandomDelay, which throttles requests to that
// domain. checkHeadersFunc is consulted once the headers have arrived; if it
// returns false the body is not read and ErrAbortedAfterHeaders is returned.
// A positive bodySize caps the number of bytes read from the body. Bodies
// that look gzip-compressed (by Content-Encoding, by Content-Type when no
// encoding is given, or by an ".xml.gz" path) and were not already
// decompressed by the transport are decompressed here.
func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) {
	r := h.GetMatchingRule(request.URL.Host)
	if r != nil {
		r.waitChan <- true
		defer func(r *LimitRule) {
			randomDelay := time.Duration(0)
			// rand.Int63n panics for n <= 0, so only a positive
			// RandomDelay is randomized.
			if r.RandomDelay > 0 {
				randomDelay = time.Duration(rand.Int63n(int64(r.RandomDelay)))
			}
			time.Sleep(r.Delay + randomDelay)
			<-r.waitChan
		}(r)
	}

	res, err := h.Client.Do(request)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// After redirects the response refers to the request that was actually
	// answered; prefer it for the header check and the path test below.
	finalRequest := request
	if res.Request != nil {
		finalRequest = res.Request
	}
	if !checkHeadersFunc(finalRequest, res.StatusCode, res.Header) {
		// closing res.Body (see defer above) without reading it aborts
		// the download
		return nil, ErrAbortedAfterHeaders
	}

	var bodyReader io.Reader = res.Body
	if bodySize > 0 {
		bodyReader = io.LimitReader(bodyReader, int64(bodySize))
	}
	contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
	if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
		bodyReader, err = gzip.NewReader(bodyReader)
		if err != nil {
			return nil, err
		}
		defer bodyReader.(*gzip.Reader).Close()
	}
	body, err := io.ReadAll(bodyReader)
	if err != nil {
		return nil, err
	}
	return &Response{
		StatusCode: res.StatusCode,
		Body:       body,
		Headers:    &res.Header,
	}, nil
}
// Limit initializes rule and, if that succeeds, registers it with the
// backend. The rule is initialized before it is appended so that
// GetMatchingRule can never hand out a rule whose wait channel or patterns
// are not set up yet, and so that a rule which fails Init is not left
// registered. The Init error, if any, is returned.
func (h *httpBackend) Limit(rule *LimitRule) error {
	if err := rule.Init(); err != nil {
		return err
	}
	h.lock.Lock()
	defer h.lock.Unlock()
	if h.LimitRules == nil {
		h.LimitRules = make([]*LimitRule, 0, 8)
	}
	h.LimitRules = append(h.LimitRules, rule)
	return nil
}
// Limits registers each rule in order via Limit and stops at the first
// error, which it returns. Rules after the failing one are not processed.
func (h *httpBackend) Limits(rules []*LimitRule) error {
	for i := range rules {
		err := h.Limit(rules[i])
		if err != nil {
			return err
		}
	}
	return nil
}
+37
View File
@@ -0,0 +1,37 @@
package colly
import (
"net/http"
"net/http/httptrace"
"time"
)
// HTTPTrace provides a datastructure for storing an http trace.
type HTTPTrace struct {
// start is recorded by the GetConn hook and connect by the ConnectStart
// hook; both are the reference points for the durations below.
start, connect time.Time
// ConnectDuration is the time between the ConnectStart and ConnectDone hooks.
ConnectDuration time.Duration
// FirstByteDuration is the time between the GetConn hook and the first
// response byte.
FirstByteDuration time.Duration
}
// trace builds the httptrace.ClientTrace whose hooks fill in this HTTPTrace:
// GetConn and ConnectStart record timestamps, ConnectDone and
// GotFirstResponseByte turn them into durations. Pass the result to
// httptrace.WithClientTrace().
func (ht *HTTPTrace) trace() *httptrace.ClientTrace {
	onGetConn := func(hostPort string) {
		ht.start = time.Now()
	}
	onConnectStart := func(network, addr string) {
		ht.connect = time.Now()
	}
	onConnectDone := func(network, addr string, err error) {
		ht.ConnectDuration = time.Since(ht.connect)
	}
	onFirstByte := func() {
		ht.FirstByteDuration = time.Since(ht.start)
	}
	return &httptrace.ClientTrace{
		GetConn:              onGetConn,
		ConnectStart:         onConnectStart,
		ConnectDone:          onConnectDone,
		GotFirstResponseByte: onFirstByte,
	}
}
// WithTrace returns a copy of req whose context carries the client trace of
// this HTTPTrace, so that sending the returned request records its timings.
func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request {
	tracedCtx := httptrace.WithClientTrace(req.Context(), ht.trace())
	return req.WithContext(tracedCtx)
}
+196
View File
@@ -0,0 +1,196 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"bytes"
"encoding/json"
"io"
"net/http"
"net/url"
"strings"
"sync/atomic"
)
// Request is the representation of a HTTP request made by a Collector
type Request struct {
// URL is the parsed URL of the HTTP request
URL *url.URL
// Headers contains the Request's HTTP headers
Headers *http.Header
// Host is the value of the request's Host header
Host string
// Ctx is a context between a Request and a Response
Ctx *Context
// Depth is the number of the parents of the request
Depth int
// Method is the HTTP method of the request
Method string
// Body is the request body which is used on POST/PUT requests
Body io.Reader
// ResponseCharacterEncoding is the character encoding of the response body.
// Leave it blank to allow automatic character encoding of the response body.
// It is empty by default and it can be set in OnRequest callback.
ResponseCharacterEncoding string
// ID is the Unique identifier of the request
ID uint32
// collector is the Collector the request belongs to; follow-up requests
// (Visit, Post, Retry, ...) are issued through it.
collector *Collector
// abort is set by Abort.
abort bool
// baseURL, when non-nil, is used by AbsoluteURL instead of URL as the base
// for resolving relative URLs.
baseURL *url.URL
// ProxyURL is the proxy address that handles the request
ProxyURL string
}
// serializableRequest is the flat form of a Request that Marshal encodes as
// JSON: the URL is kept as a string, the body as the bytes read from the
// request body, and the context as a plain map.
type serializableRequest struct {
URL string
Method string
Depth int
Body []byte
ID uint32
Ctx map[string]interface{}
Headers http.Header
Host string
}
// New builds a request for URL that shares the context, host and collector
// of r. The URL is normalized through the package URL parser and re-parsed
// into a *url.URL; either step failing aborts with that error. The new
// request starts with empty headers and draws a fresh ID from the
// collector's request counter.
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
	parsed, err := urlParser.Parse(URL)
	if err != nil {
		return nil, err
	}
	target, err := url.Parse(parsed.Href(false))
	if err != nil {
		return nil, err
	}

	req := &Request{
		Method:    method,
		URL:       target,
		Body:      body,
		Ctx:       r.Ctx,
		Headers:   &http.Header{},
		Host:      r.Host,
		collector: r.collector,
	}
	req.ID = atomic.AddUint32(&r.collector.requestCount, 1)
	return req, nil
}
// Abort cancels the HTTP request when called in an OnRequest callback.
// It only sets the request's abort flag.
func (r *Request) Abort() {
r.abort = true
}
// AbsoluteURL resolves the URL chunk u against the request's base URL (or
// against the request URL when no base URL is set) and returns the absolute
// form. It returns the empty string when u is a pure fragment ("#...") or
// cannot be parsed.
func (r *Request) AbsoluteURL(u string) string {
	if strings.HasPrefix(u, "#") {
		return ""
	}

	base := r.URL
	if r.baseURL != nil {
		base = r.baseURL
	}

	resolved, err := urlParser.ParseRef(base.String(), u)
	if err != nil {
		return ""
	}
	return resolved.Href(false)
}
// Visit issues a GET request for URL through the request's collector, one
// level deeper than the current request and sharing its Context. URL is
// resolved to an absolute URL first. The collector's callbacks run for the
// new request as usual.
func (r *Request) Visit(URL string) error {
	target := r.AbsoluteURL(URL)
	nextDepth := r.Depth + 1
	return r.collector.scrape(target, "GET", nextDepth, nil, r.Ctx, nil, true)
}
// HasVisited checks if the provided URL has been visited.
// It delegates to the HasVisited method of the request's collector.
func (r *Request) HasVisited(URL string) (bool, error) {
return r.collector.HasVisited(URL)
}
// Post issues a POST request for URL with requestData encoded as form data.
// The request goes through the request's collector, one level deeper than
// the current request and sharing its Context; the collector's callbacks run
// as usual.
func (r *Request) Post(URL string, requestData map[string]string) error {
	target := r.AbsoluteURL(URL)
	form := createFormReader(requestData)
	return r.collector.scrape(target, "POST", r.Depth+1, form, r.Ctx, nil, true)
}
// PostRaw issues a POST request for URL with requestData sent verbatim as
// the body. The request goes through the request's collector, one level
// deeper than the current request and sharing its Context; the collector's
// callbacks run as usual.
func (r *Request) PostRaw(URL string, requestData []byte) error {
	target := r.AbsoluteURL(URL)
	payload := bytes.NewReader(requestData)
	return r.collector.scrape(target, "POST", r.Depth+1, payload, r.Ctx, nil, true)
}
// PostMultipart issues a multipart/form-data POST request for URL built from
// requestData. A random boundary is generated for the body, and the request
// is sent with a matching Content-Type header plus the collector's User-Agent.
// The collector's callbacks run as usual.
func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()

	headers := http.Header{}
	headers.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	headers.Set("User-Agent", r.collector.UserAgent)

	target := r.AbsoluteURL(URL)
	payload := createMultipartReader(boundary, requestData)
	return r.collector.scrape(target, "POST", r.Depth+1, payload, r.Ctx, headers, true)
}
// Retry sends the request again with the same URL, method, depth, body,
// context and headers, except that the Cookie header is removed first. A
// request whose body is set but is not an io.ReadSeeker cannot be retried
// and yields ErrRetryBodyUnseekable.
func (r *Request) Retry() error {
	r.Headers.Del("Cookie")
	if r.Body != nil {
		if _, seekable := r.Body.(io.ReadSeeker); !seekable {
			return ErrRetryBodyUnseekable
		}
	}
	return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
}
// Do sends the request as it is through its collector. The last argument
// passed to the collector is the negation of the collector's
// AllowURLRevisit setting.
func (r *Request) Do() error {
	target := r.URL.String()
	noRevisit := !r.collector.AllowURLRevisit
	return r.collector.scrape(target, r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, noRevisit)
}
// Marshal encodes the request as JSON. The context is flattened into a map,
// the body (if any) is read to the end and stored as bytes, and the headers
// are included when set. Note that reading the body consumes r.Body. A read
// error is returned as is.
func (r *Request) Marshal() ([]byte, error) {
	ctxValues := map[string]interface{}{}
	if r.Ctx != nil {
		r.Ctx.ForEach(func(key string, value interface{}) interface{} {
			ctxValues[key] = value
			return nil
		})
	}

	var payload []byte
	if r.Body != nil {
		data, err := io.ReadAll(r.Body)
		if err != nil {
			return nil, err
		}
		payload = data
	}

	out := serializableRequest{
		URL:    r.URL.String(),
		Host:   r.Host,
		Method: r.Method,
		Depth:  r.Depth,
		Body:   payload,
		ID:     r.ID,
		Ctx:    ctxValues,
	}
	if r.Headers != nil {
		out.Headers = *r.Headers
	}
	return json.Marshal(&out)
}
+116
View File
@@ -0,0 +1,116 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"bytes"
"fmt"
"io"
"mime"
"net/http"
"os"
"strings"
"github.com/saintfish/chardet"
"golang.org/x/net/html/charset"
)
// Response is the representation of a HTTP response made by a Collector.
// Body holds the complete response body in memory.
type Response struct {
// StatusCode is the status code of the Response
StatusCode int
// Body is the content of the Response
Body []byte
// Ctx is a context between a Request and a Response
Ctx *Context
// Request is the Request object of the response
Request *Request
// Headers contains the Response's HTTP headers
Headers *http.Header
// Trace contains the HTTPTrace for the request. Will only be set by the
// collector if Collector.TraceHTTP is set to true.
Trace *HTTPTrace
}
// Save writes response body to disk.
// The body is written to fileName with permissions 0644; the error from
// os.WriteFile is returned unchanged.
func (r *Response) Save(fileName string) error {
return os.WriteFile(fileName, r.Body, 0644)
}
// FileName returns a sanitized file name for the response. The "filename"
// parameter of the Content-Disposition header is preferred; otherwise the
// name is derived from the request URL: "<path>_<query>" when the URL has a
// query string, else the path without its leading slash.
func (r *Response) FileName() string {
	if _, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition")); err == nil {
		if name, ok := params["filename"]; ok {
			return SanitizeFileName(name)
		}
	}

	reqURL := r.Request.URL
	if reqURL.RawQuery == "" {
		return SanitizeFileName(strings.TrimPrefix(reqURL.Path, "/"))
	}
	return SanitizeFileName(fmt.Sprintf("%s_%s", reqURL.Path, reqURL.RawQuery))
}
// fixCharset re-encodes the response body according to its character set.
//
// An empty body is left alone. When defaultEncoding is given it is used
// unconditionally. Otherwise the Content-Type header decides: image, video,
// audio and font types are skipped; a header without a charset triggers
// detection from the body when detectCharset is true (and is skipped when it
// is false); UTF-8 content is left untouched; anything else is converted via
// encodeBytes. Conversion and detection errors are returned.
func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error {
	if len(r.Body) == 0 {
		return nil
	}
	if defaultEncoding != "" {
		converted, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding)
		if err != nil {
			return err
		}
		r.Body = converted
		return nil
	}

	contentType := strings.ToLower(r.Headers.Get("Content-Type"))
	// These MIME types should not have textual data.
	for _, binaryType := range []string{"image/", "video/", "audio/", "font/"} {
		if strings.Contains(contentType, binaryType) {
			return nil
		}
	}

	if !strings.Contains(contentType, "charset") {
		if !detectCharset {
			return nil
		}
		best, err := chardet.NewTextDetector().DetectBest(r.Body)
		if err != nil {
			return err
		}
		contentType = "text/plain; charset=" + best.Charset
	}
	if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") {
		return nil
	}

	converted, err := encodeBytes(r.Body, contentType)
	if err != nil {
		return err
	}
	r.Body = converted
	return nil
}
// encodeBytes reads b through a charset.NewReader created for contentType
// and returns everything that reader yields. It returns the error from
// creating the reader or from reading it.
func encodeBytes(b []byte, contentType string) ([]byte, error) {
	source := bytes.NewReader(b)
	decoder, err := charset.NewReader(source, contentType)
	if err != nil {
		return nil, err
	}
	return io.ReadAll(decoder)
}
+128
View File
@@ -0,0 +1,128 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"net/http"
"net/http/cookiejar"
"net/url"
"strings"
"sync"
)
// Storage is an interface which handles Collector's internal data,
// like visited urls and cookies.
// The default Storage of the Collector is the InMemoryStorage.
// Collector's storage can be changed by calling Collector.SetStorage()
// function.
type Storage interface {
// Init initializes the storage
Init() error
// Visited receives and stores a request ID that is visited by the Collector
Visited(requestID uint64) error
// IsVisited returns true if the request was visited before IsVisited
// is called
IsVisited(requestID uint64) (bool, error)
// Cookies retrieves stored cookies for a given host, serialized as a
// single string (InMemoryStorage uses StringifyCookies for this)
Cookies(u *url.URL) string
// SetCookies stores cookies for a given host; cookies is the serialized
// form returned by Cookies
SetCookies(u *url.URL, cookies string)
}
// InMemoryStorage is the default storage backend of colly.
// InMemoryStorage keeps cookies and visited urls in memory
// without persisting data on the disk.
// Init must be called before use: it creates the map, the lock and the
// cookie jar.
type InMemoryStorage struct {
// visitedURLs records the request IDs passed to Visited.
visitedURLs map[uint64]bool
// lock guards visitedURLs.
lock *sync.RWMutex
// jar stores the cookies handled by Cookies and SetCookies.
jar *cookiejar.Jar
}
// Init creates whichever of the visited map, the lock and the cookie jar
// is still missing, so calling it again keeps existing state. It returns
// the error from creating the cookie jar, if any.
func (s *InMemoryStorage) Init() error {
	if s.visitedURLs == nil {
		s.visitedURLs = map[uint64]bool{}
	}
	if s.lock == nil {
		s.lock = new(sync.RWMutex)
	}
	if s.jar != nil {
		return nil
	}
	jar, err := cookiejar.New(nil)
	s.jar = jar
	return err
}
// Visited implements Storage.Visited(): it marks requestID as visited under
// the write lock and always returns nil.
func (s *InMemoryStorage) Visited(requestID uint64) error {
	s.lock.Lock()
	defer s.lock.Unlock()
	s.visitedURLs[requestID] = true
	return nil
}
// IsVisited implements Storage.IsVisited(): it reports, under the read lock,
// whether requestID has been marked as visited. The error is always nil.
func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
	s.lock.RLock()
	defer s.lock.RUnlock()
	return s.visitedURLs[requestID], nil
}
// Cookies implements Storage.Cookies(): it returns the jar's cookies for u,
// serialized with StringifyCookies.
func (s *InMemoryStorage) Cookies(u *url.URL) string {
	stored := s.jar.Cookies(u)
	return StringifyCookies(stored)
}
// SetCookies implements Storage.SetCookies(): it parses cookies with
// UnstringifyCookies and stores the result in the jar for u.
func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
	parsed := UnstringifyCookies(cookies)
	s.jar.SetCookies(u, parsed)
}
// Close is a no-op for InMemoryStorage and always returns nil.
// NOTE(review): the Storage interface declared in this file has no Close
// method, so this is not part of that interface — confirm which interface
// it is meant to satisfy.
func (s *InMemoryStorage) Close() error {
return nil
}
// StringifyCookies serializes cookies into a single string: each cookie is
// rendered with its String method and the results are joined with newlines.
// An empty list yields the empty string.
func StringifyCookies(cookies []*http.Cookie) string {
	var sb strings.Builder
	for i, c := range cookies {
		if i > 0 {
			sb.WriteByte('\n')
		}
		sb.WriteString(c.String())
	}
	return sb.String()
}
// UnstringifyCookies parses a newline-separated cookie string back into
// cookies. Every line is treated as the value of a Set-Cookie header and
// parsed by net/http via http.Response.Cookies.
func UnstringifyCookies(s string) []*http.Cookie {
	lines := strings.Split(s, "\n")
	header := make(http.Header, 1)
	for i := range lines {
		header.Add("Set-Cookie", lines[i])
	}
	return (&http.Response{Header: header}).Cookies()
}
// ContainsCookie reports whether cookies contains a cookie whose Name equals
// name.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for i := 0; i < len(cookies); i++ {
		if cookies[i].Name != name {
			continue
		}
		return true
	}
	return false
}
+218
View File
@@ -0,0 +1,218 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"errors"
"reflect"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Unmarshal is a shorthand for colly.UnmarshalHTML.
// It unmarshals the element's DOM into v using v's struct tags (no
// struct map is passed).
func (h *HTMLElement) Unmarshal(v interface{}) error {
return UnmarshalHTML(v, h.DOM, nil)
}
// UnmarshalWithMap is a shorthand for colly.UnmarshalHTML, extended to allow maps to be passed in.
// structMap maps struct field names to selectors and is used instead of the
// struct tags of v.
func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error {
return UnmarshalHTML(v, h.DOM, structMap)
}
// UnmarshalHTML declaratively extracts text or attributes to a struct from
// HTML response using struct tags composed of css selectors.
// Allowed struct tags:
//   - "selector" (required): CSS (goquery) selector of the desired data
//   - "attr" (optional): Selects the matching element's attribute's value.
//     Leave it blank or omit to get the text of the element.
//
// Example struct declaration:
//
//	type Nested struct {
//		String  string   `selector:"div > p"`
//		Classes []string `selector:"li" attr:"class"`
//		Struct  *Nested  `selector:"div > div"`
//	}
//
// Supported types: struct, *struct, string, []string
//
// v must be a non-nil pointer to a struct; anything else yields an error.
// When structMap is non-nil it maps field names to selectors and is used
// instead of the struct tags; fields that cannot be set are skipped.
func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error {
	rv := reflect.ValueOf(v)
	if rv.Kind() != reflect.Ptr || rv.IsNil() {
		return errors.New("Invalid type or nil-pointer")
	}
	sv := rv.Elem()
	// NumField and FieldByName panic on anything but a struct, so reject
	// pointers to other kinds up front.
	if sv.Kind() != reflect.Struct {
		return errors.New("Invalid type or nil-pointer")
	}
	st := sv.Type()

	if structMap != nil {
		for fieldName, selector := range structMap {
			attrV := sv.FieldByName(fieldName)
			if !attrV.CanAddr() || !attrV.CanSet() {
				continue
			}
			if err := unmarshalSelector(s, attrV, selector); err != nil {
				return err
			}
		}
		return nil
	}

	for i := 0; i < sv.NumField(); i++ {
		attrV := sv.Field(i)
		if !attrV.CanAddr() || !attrV.CanSet() {
			continue
		}
		if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil {
			return err
		}
	}
	return nil
}
// unmarshalSelector fills the field attrV from the elements matched by
// selector inside s, dispatching on the field's kind: slice, string, struct
// or pointer. Text content is always extracted (no HTML attribute is used).
// The selector "-" marks a field to be ignored. Any other field kind yields
// an error.
func unmarshalSelector(s *goquery.Selection, attrV reflect.Value, selector string) error {
	// A "-" selector means the field should be ignored.
	if selector == "-" {
		return nil
	}
	const htmlAttr = ""
	// TODO support more types
	switch attrV.Kind() {
	case reflect.String:
		text := getDOMValue(s.Find(selector), htmlAttr)
		attrV.Set(reflect.Indirect(reflect.ValueOf(text)))
		return nil
	case reflect.Slice:
		return unmarshalSlice(s, selector, htmlAttr, attrV)
	case reflect.Struct:
		return unmarshalStruct(s, selector, attrV)
	case reflect.Ptr:
		return unmarshalPtr(s, selector, attrV)
	}
	return errors.New("Invalid type: " + attrV.String())
}
// unmarshalAttr fills the field attrV according to the "selector" and "attr"
// struct tags of attrT, dispatching on the field's kind: slice, string,
// struct or pointer. The selector "-" marks a field to be ignored. Any other
// field kind yields an error.
func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error {
	selector := attrT.Tag.Get("selector")
	// A "-" selector means the field should be ignored.
	if selector == "-" {
		return nil
	}
	htmlAttr := attrT.Tag.Get("attr")
	// TODO support more types
	switch attrV.Kind() {
	case reflect.String:
		text := getDOMValue(s.Find(selector), htmlAttr)
		attrV.Set(reflect.Indirect(reflect.ValueOf(text)))
		return nil
	case reflect.Slice:
		return unmarshalSlice(s, selector, htmlAttr, attrV)
	case reflect.Struct:
		return unmarshalStruct(s, selector, attrV)
	case reflect.Ptr:
		return unmarshalPtr(s, selector, attrV)
	}
	return errors.New("Invalid type: " + attrV.String())
}
// unmarshalStruct fills the struct field attrV from s, narrowed by selector
// when one is given. The field is left untouched when the resulting
// selection has no nodes. A fresh value is unmarshalled first and assigned
// only on success.
func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error {
	scope := s
	if selector != "" {
		scope = s.Find(selector)
	}
	if scope.Nodes == nil {
		return nil
	}
	fresh := reflect.New(attrV.Type())
	if err := UnmarshalHTML(fresh.Interface(), scope, nil); err != nil {
		return err
	}
	attrV.Set(fresh.Elem())
	return nil
}
// unmarshalPtr fills the pointer field attrV from s, narrowed by selector
// when one is given. The field is left untouched when the resulting
// selection has no nodes. Only pointers to structs are supported; a new
// struct is allocated, unmarshalled, and assigned only on success.
func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error {
	scope := s
	if selector != "" {
		scope = s.Find(selector)
	}
	if scope.Nodes == nil {
		return nil
	}
	target := attrV.Type().Elem()
	if target.Kind() != reflect.Struct {
		return errors.New("Invalid slice type")
	}
	fresh := reflect.New(target)
	if err := UnmarshalHTML(fresh.Interface(), scope, nil); err != nil {
		return err
	}
	attrV.Set(fresh)
	return nil
}
// unmarshalSlice appends one element to the slice field attrV for every
// element matched by selector inside s. Supported element kinds are string
// (the text or htmlAttr value of the match), pointer and struct (a nested
// UnmarshalHTML of the match). A nil slice is replaced by an empty one
// first. Any other element kind yields an error.
//
// If unmarshalling a nested element fails, iteration stops, the failing
// element is not appended, and the error is returned rather than dropped.
func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error {
	if attrV.IsNil() {
		attrV.Set(reflect.MakeSlice(attrV.Type(), 0, 0))
	}
	elemType := attrV.Type().Elem()

	var nestedErr error
	switch elemType.Kind() {
	case reflect.String:
		s.Find(selector).Each(func(_ int, s *goquery.Selection) {
			val := getDOMValue(s, htmlAttr)
			attrV.Set(reflect.Append(attrV, reflect.Indirect(reflect.ValueOf(val))))
		})
	case reflect.Ptr:
		s.Find(selector).EachWithBreak(func(_ int, innerSel *goquery.Selection) bool {
			someVal := reflect.New(elemType.Elem())
			if nestedErr = UnmarshalHTML(someVal.Interface(), innerSel, nil); nestedErr != nil {
				return false
			}
			attrV.Set(reflect.Append(attrV, someVal))
			return true
		})
	case reflect.Struct:
		s.Find(selector).EachWithBreak(func(_ int, innerSel *goquery.Selection) bool {
			someVal := reflect.New(elemType)
			if nestedErr = UnmarshalHTML(someVal.Interface(), innerSel, nil); nestedErr != nil {
				return false
			}
			attrV.Set(reflect.Append(attrV, reflect.Indirect(someVal)))
			return true
		})
	default:
		return errors.New("Invalid slice type")
	}
	return nestedErr
}
// getDOMValue extracts a string from selection s: the value of attribute
// attr when attr is non-empty (the empty string if the attribute is absent),
// otherwise the whitespace-trimmed text of the first element in s.
func getDOMValue(s *goquery.Selection, attr string) string {
	if attr != "" {
		value, _ := s.Attr(attr)
		return value
	}
	return strings.TrimSpace(s.First().Text())
}
+169
View File
@@ -0,0 +1,169 @@
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"strings"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xmlquery"
"golang.org/x/net/html"
)
// XMLElement is the representation of a XML tag.
type XMLElement struct {
// Name is the name of the tag
Name string
// Text is the inner text of the element, set by the constructors.
Text string
// attributes holds the tag's attributes: a []html.Attribute when isHTML
// is true, a []xmlquery.Attr otherwise.
attributes interface{}
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the DOM object of the page. DOM is relative
// to the current XMLElement and is either a html.Node or xmlquery.Node
// based on how the XMLElement was created.
DOM interface{}
// isHTML tells which of the two node types DOM and attributes hold: true
// for elements created from a html.Node, false for a xmlquery.Node.
isHTML bool
}
// NewXMLElementFromHTMLNode builds an XMLElement backed by the html.Node s.
// Name, attributes and inner text are taken from s; the request is taken
// from resp. The element is marked as HTML.
func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement {
	el := new(XMLElement)
	el.Name = s.Data
	el.Request = resp.Request
	el.Response = resp
	el.Text = htmlquery.InnerText(s)
	el.DOM = s
	el.attributes = s.Attr
	el.isHTML = true
	return el
}
// NewXMLElementFromXMLNode builds an XMLElement backed by the xmlquery.Node
// s. Name, attributes and inner text are taken from s; the request is taken
// from resp. The element is marked as non-HTML.
func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement {
	el := new(XMLElement)
	el.Name = s.Data
	el.Request = resp.Request
	el.Response = resp
	el.Text = s.InnerText()
	el.DOM = s
	el.attributes = s.Attr
	el.isHTML = false
	return el
}
// Attr returns the value of the attribute named k of the XMLElement, or an
// empty string if no such attribute is found.
//
// For HTML-backed elements k is compared against the attribute key; for
// XML-backed elements it is compared against the attribute's local name.
// The first matching attribute wins.
//
// The stored attribute list is inspected with a type switch rather than a
// bare type assertion, so an XMLElement that was not produced by one of the
// constructors (e.g. a zero value, whose attribute list is nil) yields ""
// instead of panicking.
func (h *XMLElement) Attr(k string) string {
	switch attrs := h.attributes.(type) {
	case []html.Attribute:
		for _, a := range attrs {
			if a.Key == k {
				return a.Val
			}
		}
	case []xmlquery.Attr:
		for _, a := range attrs {
			if a.Name.Local == k {
				return a.Value
			}
		}
	}
	return ""
}
// ChildText returns the whitespace-trimmed inner text of the element that
// FindOne locates for xpathQuery, evaluated relative to this element's DOM
// node. An empty string is returned when the query matches nothing.
func (h *XMLElement) ChildText(xpathQuery string) string {
	if !h.isHTML {
		root := h.DOM.(*xmlquery.Node)
		if match := xmlquery.FindOne(root, xpathQuery); match != nil {
			return strings.TrimSpace(match.InnerText())
		}
		return ""
	}

	root := h.DOM.(*html.Node)
	if match := htmlquery.FindOne(root, xpathQuery); match != nil {
		return strings.TrimSpace(htmlquery.InnerText(match))
	}
	return ""
}
// ChildAttr looks up the element that FindOne locates for xpathQuery and
// returns the whitespace-trimmed value of its attrName attribute. An empty
// string is returned when the query matches nothing or the matched element
// has no such attribute.
func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string {
	if !h.isHTML {
		match := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery)
		if match == nil {
			return ""
		}
		for _, a := range match.Attr {
			if a.Name.Local != attrName {
				continue
			}
			return strings.TrimSpace(a.Value)
		}
		return ""
	}

	match := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery)
	if match == nil {
		return ""
	}
	for _, a := range match.Attr {
		if a.Key != attrName {
			continue
		}
		return strings.TrimSpace(a.Val)
	}
	return ""
}
// ChildAttrs collects the whitespace-trimmed values of the attrName attribute
// from every element matched by xpathQuery, relative to this element's DOM
// node. The result is nil when nothing is collected.
func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string {
	var values []string

	if !h.isHTML {
		root := h.DOM.(*xmlquery.Node)
		xmlquery.FindEach(root, xpathQuery, func(_ int, match *xmlquery.Node) {
			for _, a := range match.Attr {
				if a.Name.Local != attrName {
					continue
				}
				values = append(values, strings.TrimSpace(a.Value))
			}
		})
		return values
	}

	root := h.DOM.(*html.Node)
	matches := htmlquery.Find(root, xpathQuery)
	for _, match := range matches {
		for _, a := range match.Attr {
			if a.Key != attrName {
				continue
			}
			values = append(values, strings.TrimSpace(a.Val))
		}
	}
	return values
}
// ChildTexts returns one string per element matched by xpathQuery, relative
// to this element's DOM node. Each string is the whitespace-trimmed inner
// text of the corresponding match. The result is an empty, non-nil slice when
// nothing matches.
func (h *XMLElement) ChildTexts(xpathQuery string) []string {
	result := []string{}

	if !h.isHTML {
		root := h.DOM.(*xmlquery.Node)
		xmlquery.FindEach(root, xpathQuery, func(_ int, match *xmlquery.Node) {
			result = append(result, strings.TrimSpace(match.InnerText()))
		})
		return result
	}

	root := h.DOM.(*html.Node)
	matches := htmlquery.Find(root, xpathQuery)
	for _, match := range matches {
		result = append(result, strings.TrimSpace(htmlquery.InnerText(match)))
	}
	return result
}