feat: Initial commit

This commit is contained in:
2025-07-26 05:58:59 +00:00
commit 753d1c60ea
1849 changed files with 830533 additions and 0 deletions
+109
View File
@@ -0,0 +1,109 @@
package crawler
import (
"context"
"errors"
"fmt"
"log"
"math/rand"
"regexp"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
// Header pools and block-page detection used when issuing crawl requests.
var (
	// userAgents is the pool of User-Agent header values; one entry is
	// chosen at random for every outgoing request.
	//
	// NOTE(review): the third entry pairs AppleWebKit/KHTML tokens with a
	// Firefox product token, which is not the format Firefox itself sends —
	// confirm this is intentional.
	userAgents = []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/117.0",
	}

	// referers is the pool of Referer header values; one entry is chosen at
	// random for every outgoing request.
	referers = []string{
		"https://www.google.com/",
		"https://www.hermes.com/us/en/",
	}

	// blockRegex matches the "Access blocked" message (case-insensitive,
	// any run of whitespace between the two words) in a response body.
	blockRegex = regexp.MustCompile(`(?i)Access\s+blocked`)
)
// init deliberately performs no random-number seeding.
//
// The package-level math/rand functions (rand.Intn) used when picking request
// headers are seeded randomly by the runtime on Go 1.20 and later, so no
// explicit seed is required. Constructing a separate generator with
// rand.New(rand.NewSource(...)) and dropping the result has no effect on those
// package-level functions, so no such call is made here.
func init() {}
// Scrape visits url once and returns every product tile found on the page,
// keyed by SKU.
//
// A tile is a `div.product-grid-list-item` element whose id attribute ends in
// "-H<word chars>"; the part after the dash is the SKU. Tiles without a name
// or link are dropped. A tile counts as available when it contains no span
// with the text "Unavailable".
//
// ctx controls cancellation of the HTTP request. If it carries
// DebugContextKey set to true, the raw response body is logged.
//
// Scrape returns a non-nil error when:
//   - ctx is already done, or the rate limit cannot be installed;
//   - the response body (successful or error status) contains the
//     "Access blocked" message;
//   - the visit itself fails (network error, non-success status, ...).
//
// The returned map is nil whenever the error is non-nil.
func Scrape(ctx context.Context, url string) (map[string]Bag, error) {
	// Bail out before touching the network if the caller has already given up.
	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
	}

	var blockedError error
	var responseBody []byte // Body of the last successful response.

	// detectBlock records blockedError when body carries the block message.
	detectBlock := func(body []byte) {
		if blockRegex.Match(body) {
			blockedError = errors.New("request failed: hit anti-crawling wall (message found in page)")
		}
	}

	// Bind the collector to ctx so cancelling it aborts the in-flight request.
	c := colly.NewCollector(colly.StdlibContext(ctx))

	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 2 * time.Second,
	}); err != nil {
		return nil, fmt.Errorf("could not configure rate limit: %w", err)
	}

	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", userAgents[rand.Intn(len(userAgents))])
		r.Headers.Set("Referer", referers[rand.Intn(len(referers))])
		// Accept-Encoding is left to the HTTP client; it is not set manually.
	})

	// Keep the body so OnScraped can inspect it after HTML parsing is done.
	c.OnResponse(func(r *colly.Response) {
		responseBody = r.Body
	})

	foundBags := make(map[string]Bag)
	skuRegex := regexp.MustCompile(`-H\w+$`)

	c.OnHTML("div.product-grid-list-item", func(e *colly.HTMLElement) {
		skuMatch := skuRegex.FindString(e.Attr("id"))
		if skuMatch == "" {
			return
		}
		sku := strings.TrimPrefix(skuMatch, "-")

		isAvailable := e.DOM.Find("span:contains('Unavailable')").Length() == 0

		// Resolve the image source against the page URL instead of blindly
		// prefixing "https:". This still handles protocol-relative sources
		// ("//host/img.jpg") but also leaves ImageURL empty when the attribute
		// is missing and does not corrupt a source that is already absolute.
		imageURL := ""
		if src := e.ChildAttr("img[id^='img-']", "src"); src != "" {
			imageURL = e.Request.AbsoluteURL(src)
		}

		bag := Bag{
			SKU:          sku,
			Name:         e.ChildText("span.product-title"),
			URL:          e.Request.AbsoluteURL(e.ChildAttr("a.product-item-name", "href")),
			ImageURL:     imageURL,
			Availability: isAvailable,
		}
		if bag.Name != "" && bag.URL != "" {
			foundBags[sku] = bag
		}
	})

	c.OnScraped(func(r *colly.Response) {
		if isDebug, ok := ctx.Value(DebugContextKey).(bool); ok && isDebug {
			log.Printf("[DEBUG] Raw HTML response received:\n%s\n", string(responseBody))
		}
		// A block page can be served with a success status.
		detectBlock(responseBody)
	})

	c.OnError(func(r *colly.Response, err error) {
		log.Printf("❌ Colly request error for %s (Status: %d): %v", r.Request.URL, r.StatusCode, err)
		// Error statuses (e.g. 403) never reach OnResponse/OnScraped, so the
		// block message has to be looked for here as well.
		detectBlock(r.Body)
	})

	err := c.Visit(url)
	// A detected block is more specific than the generic visit error, so it
	// takes precedence.
	if blockedError != nil {
		return nil, blockedError
	}
	if err != nil {
		return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
	}
	return foundBags, nil
}
+74
View File
@@ -0,0 +1,74 @@
package crawler_test
import (
"context"
"fmt"
"strings"
"testing"
"git.pengzhan.dev/aimaren/internal/crawler"
)
// testURL is the live category page that the end-to-end test scrapes.
const testURL = "https://www.hermes.com/us/en/category/women/bags-and-small-leather-goods/bags-and-clutches/"
// TestScrape_EndToEnd performs a live test against the Hermès website.
//
// NOTE: This test makes a real network request and may fail due to network
// issues, IP blocking, or changes on the live website. It is skipped when the
// test binary is run with -short so that quick local/CI runs stay offline.
//
// Checks performed:
//   - Scrape returns no error and at least one bag;
//   - every bag has a non-empty Name and a URL starting with "http";
//   - availability counts are logged, with a warning (not a failure) when
//     every bag reports the same availability.
func TestScrape_EndToEnd(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping live end-to-end scrape in -short mode")
	}

	t.Log("🚀 Starting end-to-end test against:", testURL)

	// Enable debug logging of the raw response for easier diagnosis on failure.
	ctx := context.WithValue(t.Context(), crawler.DebugContextKey, true)

	// Requirement 1 & 2: Get HTML content and generate bags.
	// The Scrape function handles the HTTP request internally.
	bags, err := crawler.Scrape(ctx, testURL)
	if err != nil {
		t.Fatalf("❌ Test Failed: The scrape function returned an error. This could be a network issue or a non-200 response from the server. Error: %v", err)
	}

	// Requirement 2: Fail if the result is empty.
	if len(bags) == 0 {
		t.Fatalf("❌ Test Failed: Scraper found 0 items. The website's HTML structure has likely changed completely, or the request was blocked.")
	}
	t.Logf("✅ Successfully scraped %d items. Performing data validation...", len(bags))

	var availableCount, unavailableCount int
	var sampleBags []string
	for sku, bag := range bags {
		// Requirement 4 (Others): Sanity check each parsed item.
		if bag.Name == "" {
			t.Errorf("❌ Test Failed: Bag with SKU %s has an empty Name.", sku)
		}
		if !strings.HasPrefix(bag.URL, "http") {
			t.Errorf("❌ Test Failed: Bag with SKU %s has an invalid URL: %s", sku, bag.URL)
		}

		// Requirement 3: Count availability for health check.
		if bag.Availability {
			availableCount++
		} else {
			unavailableCount++
		}

		// Collect a few samples for logging. Map iteration order is random,
		// so the samples differ between runs.
		if len(sampleBags) < 3 {
			sampleBags = append(sampleBags, fmt.Sprintf(" - %s (Available: %t)", bag.Name, bag.Availability))
		}
	}

	// Log statistics for review.
	t.Logf("📊 Availability Stats: %d Available, %d Unavailable", availableCount, unavailableCount)
	t.Logf("✨ Sample Items:\n%s", strings.Join(sampleBags, "\n"))

	// Requirement 3: Warn if availability is homogenous.
	if availableCount == 0 || unavailableCount == 0 {
		// This is a warning, not a failure. It flags a potential issue with the availability logic.
		t.Logf("⚠️ WARNING: All scraped bags have the same availability status. The 'unavailable' indicator on the website may have changed, causing our logic to be incorrect.")
	}

	t.Log("✅ End-to-end test completed successfully.")
}
+15
View File
@@ -0,0 +1,15 @@
package crawler
import "time"
// Bag holds the metadata for a single product.
type Bag struct {
SKU string `firestore:"sku"`
Name string `firestore:"name"`
URL string `firestore:"url"`
ImageURL string `firestore:"imageURL"`
Availability bool `firestore:"availability"`
CreatedTimestamp time.Time `firestore:"createdTimestamp,serverTimestamp"`
UpdatedTimestamp time.Time `firestore:"updatedTimestamp,serverTimestamp"`
DeleteTimestamp *time.Time `firestore:"deleteTimestamp,omitempty"`
}
+7
View File
@@ -0,0 +1,7 @@
package crawler
// contextKey is a package-private string type for context keys, so that keys
// defined here cannot collide with plain string keys set by other packages.
type contextKey string

// DebugContextKey is the context key under which the debug flag is stored.
const DebugContextKey = contextKey("debug")