feat: Initial commit
This commit is contained in:
@@ -0,0 +1,42 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"github.com/spf13/viper"
|
||||
)
|
||||
|
||||
// Config is the application configuration populated by Load. Each
// mapstructure tag names the key the field is decoded from.
type Config struct {
	// GCPProjectID is decoded from the "gcp_project_id" key.
	GCPProjectID string `mapstructure:"gcp_project_id"`

	// Telegram groups the settings nested under the "telegram" key.
	Telegram struct {
		// Token is decoded from "telegram.token".
		Token string `mapstructure:"token"`
		// ChatIDs is decoded from "telegram.chat_ids".
		ChatIDs []int64 `mapstructure:"chat_ids"`
	} `mapstructure:"telegram"`
}
|
||||
|
||||
func Load() (Config, error) {
|
||||
viper.SetConfigName("config")
|
||||
viper.SetConfigType("yaml")
|
||||
viper.AddConfigPath(".")
|
||||
|
||||
viper.SetEnvPrefix("CRAWLER")
|
||||
viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
|
||||
viper.AutomaticEnv()
|
||||
|
||||
var cfg Config
|
||||
if err := viper.ReadInConfig(); err != nil {
|
||||
// If the error is that the file wasn't found, that's okay. Log it and continue.
|
||||
if _, ok := err.(viper.ConfigFileNotFoundError); ok {
|
||||
log.Println("ℹ️ No 'config.yaml' file found. Relying on environment variables or flags.")
|
||||
} else {
|
||||
// For any other error (e.g., malformed YAML), return the error.
|
||||
return cfg, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := viper.Unmarshal(&cfg); err != nil {
|
||||
return cfg, err
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
// userAgents is the pool of User-Agent header values; Scrape picks one at
// random for every request. Each entry must be a string a real browser
// sends: the Firefox entry uses Firefox's own Gecko token layout rather than
// mixing AppleWebKit tokens with a Firefox product token.
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0",
}
|
||||
// referers is the pool of Referer header values; one is chosen at random
// for every outgoing request.
var referers = []string{"https://www.google.com/", "https://www.hermes.com/us/en/"}
|
||||
// blockRegex matches the phrase "Access blocked", case-insensitively and
// with any run of whitespace between the two words.
var blockRegex = regexp.MustCompile(`(?i)Access\s+blocked`)
|
||||
|
||||
// init is intentionally a no-op. The previous body constructed a private
// *rand.Rand and immediately discarded it, which seeded nothing: the
// package-level math/rand functions used elsewhere in this package never saw
// that source. Since Go 1.20 the package-level generator is seeded randomly
// at program start, so no manual seeding is required.
func init() {}
|
||||
|
||||
func Scrape(ctx context.Context, url string) (map[string]Bag, error) {
|
||||
var blockedError error
|
||||
var responseBody []byte // Variable to store the decompressed body
|
||||
c := colly.NewCollector()
|
||||
|
||||
c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
RandomDelay: 2 * time.Second,
|
||||
})
|
||||
|
||||
c.OnRequest(func(r *colly.Request) {
|
||||
r.Headers.Set("User-Agent", userAgents[rand.Intn(len(userAgents))])
|
||||
r.Headers.Set("Referer", referers[rand.Intn(len(referers))])
|
||||
// By default, Colly adds the correct Accept-Encoding header.
|
||||
// We don't need to set it manually.
|
||||
})
|
||||
|
||||
// The OnResponse callback now just saves the body for later.
|
||||
c.OnResponse(func(r *colly.Response) {
|
||||
responseBody = r.Body
|
||||
})
|
||||
|
||||
foundBags := make(map[string]Bag)
|
||||
skuRegex := regexp.MustCompile(`-H\w+$`)
|
||||
|
||||
c.OnHTML("div.product-grid-list-item", func(e *colly.HTMLElement) {
|
||||
// ... HTML parsing logic remains the same ...
|
||||
idStr := e.Attr("id")
|
||||
skuMatch := skuRegex.FindString(idStr)
|
||||
if skuMatch == "" {
|
||||
return
|
||||
}
|
||||
sku := strings.TrimPrefix(skuMatch, "-")
|
||||
|
||||
unavailableIndicator := e.DOM.Find("span:contains('Unavailable')")
|
||||
isAvailable := unavailableIndicator.Length() == 0
|
||||
|
||||
bag := Bag{
|
||||
SKU: sku,
|
||||
Name: e.ChildText("span.product-title"),
|
||||
URL: e.Request.AbsoluteURL(e.ChildAttr("a.product-item-name", "href")),
|
||||
ImageURL: "https:" + e.ChildAttr("img[id^='img-']", "src"),
|
||||
Availability: isAvailable,
|
||||
}
|
||||
|
||||
if bag.Name != "" && bag.URL != "" {
|
||||
foundBags[sku] = bag
|
||||
}
|
||||
})
|
||||
|
||||
c.OnScraped(func(r *colly.Response) {
|
||||
// The response body is now guaranteed to be decompressed and readable.
|
||||
if isDebug, ok := ctx.Value(DebugContextKey).(bool); ok && isDebug {
|
||||
log.Printf("[DEBUG] Raw HTML response received:\n%s\n", string(responseBody))
|
||||
}
|
||||
|
||||
// Check for the block message on the readable content.
|
||||
if blockRegex.Match(responseBody) {
|
||||
blockedError = errors.New("request failed: hit anti-crawling wall (message found in page)")
|
||||
}
|
||||
})
|
||||
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
log.Printf("❌ Colly request error for %s (Status: %d): %v", r.Request.URL, r.StatusCode, err)
|
||||
})
|
||||
|
||||
err := c.Visit(url)
|
||||
|
||||
if blockedError != nil {
|
||||
return nil, blockedError
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not complete visit to %s: %w", url, err)
|
||||
}
|
||||
|
||||
return foundBags, nil
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
package crawler_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"git.pengzhan.dev/aimaren/internal/crawler"
|
||||
)
|
||||
|
||||
// testURL is the live page the end-to-end test scrapes.
const testURL = "https://www.hermes.com/us/en/category/women/bags-and-small-leather-goods/bags-and-clutches/"
|
||||
|
||||
// TestScrape_EndToEnd performs a live test against the Hermès website.
|
||||
// NOTE: This test makes a real network request and may fail due to network issues,
|
||||
// IP blocking, or changes on the live website.
|
||||
func TestScrape_EndToEnd(t *testing.T) {
|
||||
t.Log("🚀 Starting end-to-end test against:", testURL)
|
||||
|
||||
// Requirement 1 & 2: Get HTML content and generate bags.
|
||||
// The Scrape function handles the HTTP request internally.
|
||||
// If the response is not 200 OK, it will return an error.
|
||||
bags, err := crawler.Scrape(context.WithValue(t.Context(), crawler.DebugContextKey, true), testURL)
|
||||
if err != nil {
|
||||
t.Fatalf("❌ Test Failed: The scrape function returned an error. This could be a network issue or a non-200 response from the server. Error: %v", err)
|
||||
}
|
||||
|
||||
// Requirement 2: Fail if the result is empty.
|
||||
if len(bags) == 0 {
|
||||
t.Fatalf("❌ Test Failed: Scraper found 0 items. The website's HTML structure has likely changed completely, or the request was blocked.")
|
||||
}
|
||||
|
||||
t.Logf("✅ Successfully scraped %d items. Performing data validation...", len(bags))
|
||||
|
||||
var availableCount, unavailableCount int
|
||||
var sampleBags []string
|
||||
|
||||
for sku, bag := range bags {
|
||||
// Requirement 4 (Others): Sanity check each parsed item.
|
||||
if bag.Name == "" {
|
||||
t.Errorf("❌ Test Failed: Bag with SKU %s has an empty Name.", sku)
|
||||
}
|
||||
if !strings.HasPrefix(bag.URL, "http") {
|
||||
t.Errorf("❌ Test Failed: Bag with SKU %s has an invalid URL: %s", sku, bag.URL)
|
||||
}
|
||||
|
||||
// Requirement 3: Count availability for health check.
|
||||
if bag.Availability {
|
||||
availableCount++
|
||||
} else {
|
||||
unavailableCount++
|
||||
}
|
||||
|
||||
// Collect a few samples for logging.
|
||||
if len(sampleBags) < 3 {
|
||||
sampleBags = append(sampleBags, fmt.Sprintf(" - %s (Available: %t)", bag.Name, bag.Availability))
|
||||
}
|
||||
}
|
||||
|
||||
// Log statistics for review.
|
||||
t.Logf("📊 Availability Stats: %d Available, %d Unavailable", availableCount, unavailableCount)
|
||||
t.Logf("✨ Sample Items:\n%s", strings.Join(sampleBags, "\n"))
|
||||
|
||||
// Requirement 3: Warn if availability is homogenous.
|
||||
if availableCount == 0 || unavailableCount == 0 {
|
||||
// This is a warning, not a failure. It flags a potential issue with the availability logic.
|
||||
t.Logf("⚠️ WARNING: All scraped bags have the same availability status. The 'unavailable' indicator on the website may have changed, causing our logic to be incorrect.")
|
||||
}
|
||||
|
||||
t.Log("✅ End-to-end test completed successfully.")
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package crawler
|
||||
|
||||
import "time"
|
||||
|
||||
// Bag is the metadata record for a single product. The firestore tags give
// the field names used when the record is stored.
type Bag struct {
	// SKU is the product identifier.
	SKU string `firestore:"sku"`
	// Name is the product title.
	Name string `firestore:"name"`
	// URL is the link to the product page.
	URL string `firestore:"url"`
	// ImageURL is the link to the product image.
	ImageURL string `firestore:"imageURL"`
	// Availability reports whether the product is considered available.
	Availability bool `firestore:"availability"`

	// CreatedTimestamp and UpdatedTimestamp carry the serverTimestamp option.
	// NOTE(review): confirm how that option interacts with code that assigns
	// these fields explicitly before writing.
	CreatedTimestamp time.Time `firestore:"createdTimestamp,serverTimestamp"`
	UpdatedTimestamp time.Time `firestore:"updatedTimestamp,serverTimestamp"`

	// DeleteTimestamp is nil while the bag is live; the field is omitted
	// from storage when unset.
	DeleteTimestamp *time.Time `firestore:"deleteTimestamp,omitempty"`
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package crawler
|
||||
|
||||
// contextKey is a dedicated string type for this package's context keys, so
// they cannot collide with keys of any other type.
type contextKey string

// DebugContextKey is the context key under which the debug flag is stored.
const DebugContextKey = contextKey("debug")
|
||||
@@ -0,0 +1,120 @@
|
||||
package driver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"git.pengzhan.dev/aimaren/internal/crawler"
|
||||
"git.pengzhan.dev/aimaren/internal/notifier"
|
||||
"git.pengzhan.dev/aimaren/internal/storage"
|
||||
)
|
||||
|
||||
// hermesBagsURL is the category page handed to the crawler on every run.
const hermesBagsURL = "https://www.hermes.com/us/en/category/women/bags-and-small-leather-goods/bags-and-clutches/"
|
||||
|
||||
// Driver orchestrates the crawling, state management, and notification process.
|
||||
type Driver struct {
|
||||
store storage.Storer
|
||||
notify notifier.Notifier
|
||||
}
|
||||
|
||||
func New(store storage.Storer, notify notifier.Notifier) *Driver {
|
||||
return &Driver{store: store, notify: notify}
|
||||
}
|
||||
|
||||
// Run now uses the consolidated AppState model.
|
||||
func (d *Driver) Run(ctx context.Context) error {
|
||||
log.Println("🚀 Kicking off new crawl cycle...")
|
||||
|
||||
scrapedBags, err := crawler.Scrape(ctx, hermesBagsURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("scraping failed: %w", err)
|
||||
}
|
||||
if len(scrapedBags) == 0 {
|
||||
return errors.New("scraper found 0 items, indicating a possible block or site change")
|
||||
}
|
||||
log.Printf("✅ Scraped %d items successfully.", len(scrapedBags))
|
||||
|
||||
// Fetch the entire application state in one call.
|
||||
appState, err := d.store.FetchAppState(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not fetch app state from storage: %w", err)
|
||||
}
|
||||
|
||||
// Process all changes, which modifies the appState object directly.
|
||||
hasChanges, notifications := d.processChanges(appState, scrapedBags)
|
||||
|
||||
// If there are notifications, broadcast to all chat IDs from the app state.
|
||||
if len(notifications) > 0 {
|
||||
if len(appState.ChatIDs) > 0 {
|
||||
log.Printf("✨ Found %d events. Broadcasting to %d subscribers...", len(notifications), len(appState.ChatIDs))
|
||||
for _, msg := range notifications {
|
||||
// The notifier now needs the list of IDs to send to.
|
||||
d.notify.Broadcast(appState.ChatIDs, msg)
|
||||
}
|
||||
} else {
|
||||
log.Println("✨ Found events, but no subscribers to notify.")
|
||||
}
|
||||
}
|
||||
|
||||
// If the state has changed, persist it back to storage.
|
||||
if hasChanges {
|
||||
log.Println("💾 State has changed. Updating storage...")
|
||||
if err := d.store.UpdateAppState(ctx, appState); err != nil {
|
||||
return fmt.Errorf("failed to update app state: %w", err)
|
||||
}
|
||||
log.Println("✅ Storage state updated successfully.")
|
||||
} else {
|
||||
log.Println("✨ No changes detected that require a state update.")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// processChanges now takes and modifies the AppState directly.
|
||||
func (d *Driver) processChanges(appState *storage.AppState, scrapedBags map[string]crawler.Bag) (bool, []string) {
|
||||
var notifications []string
|
||||
hasChanges := false
|
||||
|
||||
for sku, newBag := range scrapedBags {
|
||||
oldBag, exists := appState.Bags[sku]
|
||||
if !exists || (exists && oldBag.DeleteTimestamp != nil) {
|
||||
hasChanges = true
|
||||
msg := fmt.Sprintf("✨ NEW/RETURNED BAG ✨\n\nName: %s\nAvailable: %t\nURL: %s", newBag.Name, newBag.Availability, newBag.URL)
|
||||
notifications = append(notifications, msg)
|
||||
|
||||
newBag.CreatedTimestamp = time.Now()
|
||||
newBag.UpdatedTimestamp = time.Now()
|
||||
newBag.DeleteTimestamp = nil
|
||||
appState.Bags[sku] = newBag
|
||||
continue
|
||||
}
|
||||
if oldBag.Availability != newBag.Availability {
|
||||
hasChanges = true
|
||||
status := "In Stock!"
|
||||
if !newBag.Availability {
|
||||
status = "Sold Out"
|
||||
}
|
||||
msg := fmt.Sprintf("🚨 STOCK ALERT: %s 🚨\n\nName: %s \nURL: %s", status, newBag.Name, newBag.URL)
|
||||
notifications = append(notifications, msg)
|
||||
|
||||
oldBag.Availability = newBag.Availability
|
||||
oldBag.UpdatedTimestamp = time.Now()
|
||||
appState.Bags[sku] = oldBag
|
||||
}
|
||||
}
|
||||
|
||||
for sku, oldBag := range appState.Bags {
|
||||
if _, exists := scrapedBags[sku]; !exists && oldBag.DeleteTimestamp == nil {
|
||||
hasChanges = true
|
||||
now := time.Now()
|
||||
oldBag.DeleteTimestamp = &now
|
||||
appState.Bags[sku] = oldBag
|
||||
log.Printf("✅ Bag '%s' marked as removed.", oldBag.Name)
|
||||
}
|
||||
}
|
||||
|
||||
return hasChanges, notifications
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package notifier
|
||||
|
||||
// Notifier is implemented by anything that can deliver notifications.
type Notifier interface {
	// Broadcast delivers message to every chat ID in chatIDs.
	Broadcast(chatIDs []int64, message string) error
}
|
||||
@@ -0,0 +1,53 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
|
||||
"git.pengzhan.dev/aimaren/internal/config"
|
||||
|
||||
tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
|
||||
)
|
||||
|
||||
// TelegramNotifier no longer needs to store chat IDs.
|
||||
type TelegramNotifier struct {
|
||||
bot *tgbotapi.BotAPI
|
||||
}
|
||||
|
||||
// NewTelegramNotifier is now simpler and only requires the token.
|
||||
func NewTelegramNotifier(cfg config.Config) (*TelegramNotifier, error) {
|
||||
bot, err := tgbotapi.NewBotAPI(cfg.Telegram.Token)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &TelegramNotifier{bot: bot}, nil
|
||||
}
|
||||
|
||||
// Broadcast sends a message to all provided chat IDs in parallel.
|
||||
func (t *TelegramNotifier) Broadcast(chatIDs []int64, message string) error {
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for _, id := range chatIDs {
|
||||
wg.Add(1)
|
||||
// Launch a goroutine for each message to send them concurrently.
|
||||
go func(chatID int64) {
|
||||
defer wg.Done()
|
||||
msg := tgbotapi.NewMessage(chatID, message)
|
||||
if _, err := t.bot.Send(msg); err != nil {
|
||||
log.Printf("⚠️ Failed to send Telegram notification to chat ID %d: %v", chatID, err)
|
||||
}
|
||||
}(id)
|
||||
}
|
||||
|
||||
// Wait for all messages to be sent before returning.
|
||||
wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
// SendTo sends a message to a single, specific chat ID.
|
||||
// This is used by the bot-server for welcome messages.
|
||||
func (t *TelegramNotifier) SendTo(chatID int64, message string) error {
|
||||
msg := tgbotapi.NewMessage(chatID, message)
|
||||
_, err := t.bot.Send(msg)
|
||||
return err
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"cloud.google.com/go/firestore"
|
||||
"git.pengzhan.dev/aimaren/internal/crawler"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
// stateCollection is the Firestore collection holding the state document.
const stateCollection = "hermes_state"

// stateDocument is the ID of the single document that stores the state.
const stateDocument = "main"
|
||||
|
||||
type FirestoreClient struct {
|
||||
client *firestore.Client
|
||||
}
|
||||
|
||||
func NewFirestoreClient(ctx context.Context, projectID string) (*FirestoreClient, error) {
|
||||
client, err := firestore.NewClient(ctx, projectID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &FirestoreClient{client: client}, nil
|
||||
}
|
||||
|
||||
// FetchAppState retrieves the entire application state from a single document.
|
||||
func (fs *FirestoreClient) FetchAppState(ctx context.Context) (*AppState, error) {
|
||||
doc, err := fs.client.Collection(stateCollection).Doc(stateDocument).Get(ctx)
|
||||
if err != nil {
|
||||
// If the doc doesn't exist, return a new, empty AppState.
|
||||
if status.Code(err) == codes.NotFound {
|
||||
return &AppState{
|
||||
Bags: make(map[string]crawler.Bag),
|
||||
ChatIDs: []int64{},
|
||||
}, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var state AppState
|
||||
if err := doc.DataTo(&state); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
// UpdateAppState writes the entire application state back to the document.
|
||||
func (fs *FirestoreClient) UpdateAppState(ctx context.Context, newState *AppState) error {
|
||||
_, err := fs.client.Collection(stateCollection).Doc(stateDocument).Set(ctx, newState)
|
||||
return err
|
||||
}
|
||||
|
||||
// AddChatID atomically adds a new chat ID to the list in the main state document.
|
||||
func (fs *FirestoreClient) AddChatID(ctx context.Context, chatID int64) error {
|
||||
docRef := fs.client.Collection(stateCollection).Doc(stateDocument)
|
||||
_, err := docRef.Update(ctx, []firestore.Update{
|
||||
{Path: "chat_ids", Value: firestore.ArrayUnion(chatID)},
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
func (fs *FirestoreClient) Close() {
|
||||
fs.client.Close()
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"git.pengzhan.dev/aimaren/internal/crawler"
|
||||
)
|
||||
|
||||
// AppState represents the entire state of the application stored in Firestore.
|
||||
type AppState struct {
|
||||
Bags map[string]crawler.Bag `firestore:"bags"`
|
||||
ChatIDs []int64 `firestore:"chat_ids"`
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// Storer defines the interface for database operations.
|
||||
type Storer interface {
|
||||
FetchAppState(ctx context.Context) (*AppState, error)
|
||||
UpdateAppState(ctx context.Context, newState *AppState) error
|
||||
AddChatID(ctx context.Context, chatID int64) error
|
||||
}
|
||||
Reference in New Issue
Block a user