package crawler

import (
	"context"
	"errors"
	"fmt"
	"log"
	"math/rand"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// userAgents is the pool of User-Agent header values rotated across requests.
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/117.0",
}

// referers is the pool of Referer header values rotated across requests.
var referers = []string{
	"https://www.google.com/",
	"https://www.hermes.com/us/en/",
}

var (
	// blockRegex matches the "Access blocked" text looked for in response
	// bodies to detect the anti-crawling wall.
	blockRegex = regexp.MustCompile(`(?i)Access\s+blocked`)

	// skuSuffixRegex extracts the trailing "-H..." SKU suffix from a product
	// tile's id attribute. Compiled once instead of on every Scrape call.
	skuSuffixRegex = regexp.MustCompile(`-H\w+$`)
)

var (
	// headerRNGMu guards headerRNG: a *rand.Rand is not safe for concurrent
	// use, and Scrape may be called from several goroutines.
	headerRNGMu sync.Mutex

	// headerRNG is a time-seeded generator used for header rotation. The
	// previous init() built a generator and threw it away, so the global
	// source was never actually seeded by this package.
	headerRNG = rand.New(rand.NewSource(time.Now().UnixNano()))
)

// randomHeaderValue returns a randomly selected element of choices.
// choices must be non-empty.
func randomHeaderValue(choices []string) string {
	headerRNGMu.Lock()
	defer headerRNGMu.Unlock()
	return choices[headerRNG.Intn(len(choices))]
}

// productImageURL converts the src attribute of a product image into an
// absolute URL. It returns "" when src is empty, prefixes protocol-relative
// sources ("//host/path") with "https:", and resolves anything else against
// the URL of the page that e was found on.
func productImageURL(e *colly.HTMLElement, src string) string {
	src = strings.TrimSpace(src)
	switch {
	case src == "":
		return ""
	case strings.HasPrefix(src, "//"):
		return "https:" + src
	default:
		return e.Request.AbsoluteURL(src)
	}
}

// Scrape visits url and returns the bags found on the page, keyed by SKU.
//
// Every "div.product-grid-list-item" element whose id ends in a "-H..." SKU
// suffix and that has both a title and a product link yields one Bag. A bag
// is reported as available unless its tile contains a span with the text
// "Unavailable".
//
// If ctx carries DebugContextKey set to true, the raw HTML of a successfully
// scraped page is logged.
//
// Scrape returns an error when ctx is already done, when the rate limit rule
// is rejected, when the response body contains the anti-crawling block
// message, or when the visit itself fails. The returned map is nil whenever
// the error is non-nil.
func Scrape(ctx context.Context, url string) (map[string]Bag, error) {
	// Do not spend a (rate-limited) request on a context that is already
	// cancelled or past its deadline.
	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
	}

	c := colly.NewCollector()
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 2 * time.Second,
	}); err != nil {
		return nil, fmt.Errorf("could not configure rate limit: %w", err)
	}

	// Accept-Encoding is left untouched; the original author's note states
	// that colly sets it by default.
	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", randomHeaderValue(userAgents))
		r.Headers.Set("Referer", randomHeaderValue(referers))
	})

	var blockedError error
	markIfBlocked := func(body []byte) {
		if blockRegex.Match(body) {
			blockedError = errors.New("request failed: hit anti-crawling wall (message found in page)")
		}
	}

	foundBags := make(map[string]Bag)

	c.OnHTML("div.product-grid-list-item", func(e *colly.HTMLElement) {
		skuMatch := skuSuffixRegex.FindString(e.Attr("id"))
		if skuMatch == "" {
			return
		}
		sku := strings.TrimPrefix(skuMatch, "-")

		name := e.ChildText("span.product-title")
		href := e.ChildAttr("a.product-item-name", "href")
		// Check the raw href rather than the resolved URL: resolving an empty
		// reference yields the page's own URL, which would let link-less
		// tiles through.
		if name == "" || href == "" {
			return
		}

		foundBags[sku] = Bag{
			SKU:          sku,
			Name:         name,
			URL:          e.Request.AbsoluteURL(href),
			ImageURL:     productImageURL(e, e.ChildAttr("img[id^='img-']", "src")),
			Availability: e.DOM.Find("span:contains('Unavailable')").Length() == 0,
		}
	})

	c.OnScraped(func(r *colly.Response) {
		if isDebug, ok := ctx.Value(DebugContextKey).(bool); ok && isDebug {
			log.Printf("[DEBUG] Raw HTML response received:\n%s\n", string(r.Body))
		}
		markIfBlocked(r.Body)
	})

	c.OnError(func(r *colly.Response, err error) {
		log.Printf("❌ Colly request error for %s (Status: %d): %v", r.Request.URL, r.StatusCode, err)
		// With colly's default settings an HTTP error status (such as the 403
		// typically served with a block page) is delivered here and
		// OnScraped never runs, so the block message must be checked on this
		// path as well.
		markIfBlocked(r.Body)
	})

	err := c.Visit(url)
	// A detected block is more informative than the generic visit error, so
	// it takes precedence.
	if blockedError != nil {
		return nil, blockedError
	}
	if err != nil {
		return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
	}
	return foundBags, nil
}