main
Raw Download raw file
// Package models defines data structures used throughout gowarcprox.
//
// The primary type is RecordedURL, which captures all information about
// an HTTP request/response pair as it flows through the proxy. This includes:
//   - Request: URL, method, headers, body
//   - Response: status, headers, body
//   - Metadata: timing, addresses, digests, content info
//   - Warcprox-Meta: optional per-request configuration from client
//
// The package also defines supporting types for deduplication (DedupInfo),
// statistics bucketing (StatsBucketDef), and blocking rules (BlockRule).
//
// RecordedURL objects are created by the proxy handler, flow through the
// pipeline, and are consumed by processors like the WARC writer and
// deduplication checker.
 16package models
 17
 18import (
 19	"net/http"
 20	"time"
 21)
 22
 23// RecordedURL represents a captured HTTP request/response pair
 24type RecordedURL struct {
 25	// Request information
 26	URL           string
 27	Method        string
 28	RequestHeader http.Header
 29	RequestBody   []byte
 30
 31	// Response information
 32	StatusCode     int
 33	StatusMessage  string
 34	ResponseHeader http.Header
 35	ResponseBody   []byte
 36
 37	// Timing information
 38	Timestamp  time.Time
 39	Duration   time.Duration
 40
 41	// Remote server information
 42	RemoteAddr string
 43	RemoteIP   string
 44
 45	// Client information
 46	ClientAddr string
 47	ClientIP   string
 48
 49	// Digest information
 50	PayloadDigest string // e.g., "sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A"
 51	BlockDigest   string // Digest of the entire HTTP response block
 52
 53	// Content information
 54	ContentType   string
 55	ContentLength int64
 56
 57	// Deduplication information
 58	DedupInfo *DedupInfo
 59
 60	// Warcprox-Meta information
 61	WarcproxMeta *WarcproxMeta
 62
 63	// WARC information
 64	WARCRecordID string // UUID of the WARC record
 65}
 66
 67// DedupInfo contains information about a deduplicated resource
 68type DedupInfo struct {
 69	// The WARC record ID of the original capture
 70	RecordID string
 71
 72	// The URL of the original capture
 73	URL string
 74
 75	// The date of the original capture
 76	Date time.Time
 77
 78	// Whether this was found via read-only dedup bucket
 79	ReadOnly bool
 80}
 81
 82// WarcproxMeta contains per-request configuration from the Warcprox-Meta header
 83type WarcproxMeta struct {
 84	// Custom WARC prefix for this request
 85	WarcPrefix string
 86
 87	// Dedup buckets: map of bucket name to mode ("ro" or "rw")
 88	DedupBuckets map[string]string
 89
 90	// Stats buckets for tracking statistics
 91	StatsBuckets []string
 92
 93	// Stats bucket definitions with domain tallying
 94	StatsBucketDefs []StatsBucketDef
 95
 96	// Hard limits (420 response when exceeded)
 97	Limits map[string]int64
 98
 99	// Soft limits (430 response when exceeded)
100	SoftLimits map[string]int64
101
102	// URL blocking rules
103	Blocks []BlockRule
104
105	// Metadata to include in WARC records
106	Metadata map[string]interface{}
107
108	// Accept flags for response metadata
109	Accept map[string]bool
110}
111
// StatsBucketDef defines a stats bucket with optional domain tallying.
type StatsBucketDef struct {
	// Bucket is the name of the stats bucket.
	Bucket string

	// TallyDomains lists the domains to tally for this bucket. Domain
	// tallying is treated as disabled when the list is empty (see
	// RecordedURL.ShouldTallyDomain).
	TallyDomains []string
}
117
// BlockRule defines a URL blocking rule. A rule is expressed through its
// Domain, SURT and Regex strings; how they are matched is up to the code
// that evaluates the rule.
type BlockRule struct {
	Domain string
	SURT   string
	Regex  string
}
124
125// IsDedup returns true if this URL was found in the dedup database
126func (ru *RecordedURL) IsDedup() bool {
127	return ru.DedupInfo != nil
128}
129
130// GetWarcPrefix returns the WARC prefix to use for this URL
131func (ru *RecordedURL) GetWarcPrefix(defaultPrefix string) string {
132	if ru.WarcproxMeta != nil && ru.WarcproxMeta.WarcPrefix != "" {
133		return ru.WarcproxMeta.WarcPrefix
134	}
135	return defaultPrefix
136}
137
138// GetDedupBuckets returns the dedup buckets for this URL
139func (ru *RecordedURL) GetDedupBuckets() map[string]string {
140	if ru.WarcproxMeta != nil && ru.WarcproxMeta.DedupBuckets != nil {
141		return ru.WarcproxMeta.DedupBuckets
142	}
143	// Return empty bucket (default dedup bucket)
144	return map[string]string{"": "rw"}
145}
146
147// GetStatsBuckets returns the stats buckets for this URL
148func (ru *RecordedURL) GetStatsBuckets() []string {
149	if ru.WarcproxMeta == nil {
150		return nil
151	}
152
153	// Collect bucket names from both simple buckets and bucket definitions
154	buckets := make([]string, 0)
155
156	if ru.WarcproxMeta.StatsBuckets != nil {
157		buckets = append(buckets, ru.WarcproxMeta.StatsBuckets...)
158	}
159
160	for _, def := range ru.WarcproxMeta.StatsBucketDefs {
161		buckets = append(buckets, def.Bucket)
162	}
163
164	return buckets
165}
166
167// GetStatsBucketDefs returns the stats bucket definitions for this URL
168func (ru *RecordedURL) GetStatsBucketDefs() []StatsBucketDef {
169	if ru.WarcproxMeta != nil {
170		return ru.WarcproxMeta.StatsBucketDefs
171	}
172	return nil
173}
174
175// ShouldTallyDomain checks if domain tallying is enabled for the given bucket
176func (ru *RecordedURL) ShouldTallyDomain(bucket string) (bool, []string) {
177	if ru.WarcproxMeta == nil {
178		return false, nil
179	}
180
181	for _, def := range ru.WarcproxMeta.StatsBucketDefs {
182		if def.Bucket == bucket && len(def.TallyDomains) > 0 {
183			return true, def.TallyDomains
184		}
185	}
186
187	return false, nil
188}