110 lines
3.0 KiB
Go
110 lines
3.0 KiB
Go
package crawler
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"math/rand"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/gocolly/colly/v2"
|
|
)
|
|
|
|
// Header pools and the block-page detector used by Scrape.
var (
	// userAgents lists the User-Agent header values Scrape chooses from at
	// random for each outgoing request.
	userAgents = []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/117.0",
	}

	// referers lists the Referer header values Scrape chooses from at random
	// for each outgoing request.
	referers = []string{
		"https://www.google.com/",
		"https://www.hermes.com/us/en/",
	}

	// blockRegex matches the phrase "Access blocked", case-insensitively and
	// with any run of whitespace between the two words.
	blockRegex = regexp.MustCompile(`(?i)Access\s+blocked`)
)
|
|
|
|
// init seeds math/rand's global source, which is the source the package-level
// rand.Intn calls in Scrape draw from.
func init() {
	// rand.New(rand.NewSource(...)) would only build a private generator that
	// is thrown away immediately; the global source has to be seeded through
	// rand.Seed. On toolchains before Go 1.20 this is what keeps the header
	// rotation from repeating the same sequence on every run. Go 1.20+ already
	// seeds the global source randomly, so the call is redundant but harmless
	// there (rand.Seed is deprecated and may be removed once older toolchains
	// no longer need to be supported).
	rand.Seed(time.Now().UnixNano())
}
|
|
|
|
func Scrape(ctx context.Context, url string) (map[string]Bag, error) {
|
|
var blockedError error
|
|
var responseBody []byte // Variable to store the decompressed body
|
|
c := colly.NewCollector()
|
|
|
|
c.Limit(&colly.LimitRule{
|
|
DomainGlob: "*",
|
|
RandomDelay: 2 * time.Second,
|
|
})
|
|
|
|
c.OnRequest(func(r *colly.Request) {
|
|
r.Headers.Set("User-Agent", userAgents[rand.Intn(len(userAgents))])
|
|
r.Headers.Set("Referer", referers[rand.Intn(len(referers))])
|
|
// By default, Colly adds the correct Accept-Encoding header.
|
|
// We don't need to set it manually.
|
|
})
|
|
|
|
// The OnResponse callback now just saves the body for later.
|
|
c.OnResponse(func(r *colly.Response) {
|
|
responseBody = r.Body
|
|
})
|
|
|
|
foundBags := make(map[string]Bag)
|
|
skuRegex := regexp.MustCompile(`-H\w+$`)
|
|
|
|
c.OnHTML("div.product-grid-list-item", func(e *colly.HTMLElement) {
|
|
// ... HTML parsing logic remains the same ...
|
|
idStr := e.Attr("id")
|
|
skuMatch := skuRegex.FindString(idStr)
|
|
if skuMatch == "" {
|
|
return
|
|
}
|
|
sku := strings.TrimPrefix(skuMatch, "-")
|
|
|
|
unavailableIndicator := e.DOM.Find("span:contains('Unavailable')")
|
|
isAvailable := unavailableIndicator.Length() == 0
|
|
|
|
bag := Bag{
|
|
SKU: sku,
|
|
Name: e.ChildText("span.product-title"),
|
|
URL: e.Request.AbsoluteURL(e.ChildAttr("a.product-item-name", "href")),
|
|
ImageURL: "https:" + e.ChildAttr("img[id^='img-']", "src"),
|
|
Availability: isAvailable,
|
|
}
|
|
|
|
if bag.Name != "" && bag.URL != "" {
|
|
foundBags[sku] = bag
|
|
}
|
|
})
|
|
|
|
c.OnScraped(func(r *colly.Response) {
|
|
// The response body is now guaranteed to be decompressed and readable.
|
|
if isDebug, ok := ctx.Value(DebugContextKey).(bool); ok && isDebug {
|
|
log.Printf("[DEBUG] Raw HTML response received:\n%s\n", string(responseBody))
|
|
}
|
|
|
|
// Check for the block message on the readable content.
|
|
if blockRegex.Match(responseBody) {
|
|
blockedError = errors.New("request failed: hit anti-crawling wall (message found in page)")
|
|
}
|
|
})
|
|
|
|
c.OnError(func(r *colly.Response, err error) {
|
|
log.Printf("❌ Colly request error for %s (Status: %d): %v", r.Request.URL, r.StatusCode, err)
|
|
})
|
|
|
|
err := c.Visit(url)
|
|
|
|
if blockedError != nil {
|
|
return nil, blockedError
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
|
|
}
|
|
|
|
return foundBags, nil
|
|
}
|