// Package models defines data structures used throughout gowarcprox.
//
// The primary type is RecordedURL, which captures all information about
// an HTTP request/response pair as it flows through the proxy. This includes:
//   - Request: URL, method, headers, body
//   - Response: status, headers, body
//   - Metadata: timing, addresses, digests, content info
//   - Warcprox-Meta: optional per-request configuration from the client
//
// The package also defines supporting types for deduplication (DedupInfo),
// statistics bucketing (StatsBucketDef), and blocking rules (BlockRule).
//
// RecordedURL objects are created by the proxy handler, flow through the
// pipeline, and are consumed by processors like the WARC writer and
// deduplication checker.
16package models
17
18import (
19 "net/http"
20 "time"
21)
22
23// RecordedURL represents a captured HTTP request/response pair
24type RecordedURL struct {
25 // Request information
26 URL string
27 Method string
28 RequestHeader http.Header
29 RequestBody []byte
30
31 // Response information
32 StatusCode int
33 StatusMessage string
34 ResponseHeader http.Header
35 ResponseBody []byte
36
37 // Timing information
38 Timestamp time.Time
39 Duration time.Duration
40
41 // Remote server information
42 RemoteAddr string
43 RemoteIP string
44
45 // Client information
46 ClientAddr string
47 ClientIP string
48
49 // Digest information
50 PayloadDigest string // e.g., "sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A"
51 BlockDigest string // Digest of the entire HTTP response block
52
53 // Content information
54 ContentType string
55 ContentLength int64
56
57 // Deduplication information
58 DedupInfo *DedupInfo
59
60 // Warcprox-Meta information
61 WarcproxMeta *WarcproxMeta
62
63 // WARC information
64 WARCRecordID string // UUID of the WARC record
65}
66
// DedupInfo contains information about a deduplicated resource, i.e. a
// reference back to the original capture.
type DedupInfo struct {
	// RecordID is the WARC record ID of the original capture.
	RecordID string

	// URL is the URL of the original capture.
	URL string

	// Date is the date of the original capture.
	Date time.Time

	// ReadOnly reports whether this was found via a read-only dedup bucket.
	ReadOnly bool
}
81
82// WarcproxMeta contains per-request configuration from the Warcprox-Meta header
83type WarcproxMeta struct {
84 // Custom WARC prefix for this request
85 WarcPrefix string
86
87 // Dedup buckets: map of bucket name to mode ("ro" or "rw")
88 DedupBuckets map[string]string
89
90 // Stats buckets for tracking statistics
91 StatsBuckets []string
92
93 // Stats bucket definitions with domain tallying
94 StatsBucketDefs []StatsBucketDef
95
96 // Hard limits (420 response when exceeded)
97 Limits map[string]int64
98
99 // Soft limits (430 response when exceeded)
100 SoftLimits map[string]int64
101
102 // URL blocking rules
103 Blocks []BlockRule
104
105 // Metadata to include in WARC records
106 Metadata map[string]interface{}
107
108 // Accept flags for response metadata
109 Accept map[string]bool
110}
111
// StatsBucketDef defines a stats bucket with optional domain tallying.
type StatsBucketDef struct {
	// Bucket is the bucket name; RecordedURL.GetStatsBuckets includes it in
	// its result.
	Bucket string

	// TallyDomains lists the domains to tally for this bucket.
	// RecordedURL.ShouldTallyDomain treats an empty list as "tallying off".
	TallyDomains []string
}
117
// BlockRule defines a URL blocking rule.
//
// NOTE(review): how the three fields combine (alternatives vs. all-must-match)
// is decided by the code that evaluates the rule, which is not in this file —
// confirm there before relying on either reading.
type BlockRule struct {
	Domain string
	SURT   string
	Regex  string
}
124
125// IsDedup returns true if this URL was found in the dedup database
126func (ru *RecordedURL) IsDedup() bool {
127 return ru.DedupInfo != nil
128}
129
130// GetWarcPrefix returns the WARC prefix to use for this URL
131func (ru *RecordedURL) GetWarcPrefix(defaultPrefix string) string {
132 if ru.WarcproxMeta != nil && ru.WarcproxMeta.WarcPrefix != "" {
133 return ru.WarcproxMeta.WarcPrefix
134 }
135 return defaultPrefix
136}
137
138// GetDedupBuckets returns the dedup buckets for this URL
139func (ru *RecordedURL) GetDedupBuckets() map[string]string {
140 if ru.WarcproxMeta != nil && ru.WarcproxMeta.DedupBuckets != nil {
141 return ru.WarcproxMeta.DedupBuckets
142 }
143 // Return empty bucket (default dedup bucket)
144 return map[string]string{"": "rw"}
145}
146
147// GetStatsBuckets returns the stats buckets for this URL
148func (ru *RecordedURL) GetStatsBuckets() []string {
149 if ru.WarcproxMeta == nil {
150 return nil
151 }
152
153 // Collect bucket names from both simple buckets and bucket definitions
154 buckets := make([]string, 0)
155
156 if ru.WarcproxMeta.StatsBuckets != nil {
157 buckets = append(buckets, ru.WarcproxMeta.StatsBuckets...)
158 }
159
160 for _, def := range ru.WarcproxMeta.StatsBucketDefs {
161 buckets = append(buckets, def.Bucket)
162 }
163
164 return buckets
165}
166
167// GetStatsBucketDefs returns the stats bucket definitions for this URL
168func (ru *RecordedURL) GetStatsBucketDefs() []StatsBucketDef {
169 if ru.WarcproxMeta != nil {
170 return ru.WarcproxMeta.StatsBucketDefs
171 }
172 return nil
173}
174
175// ShouldTallyDomain checks if domain tallying is enabled for the given bucket
176func (ru *RecordedURL) ShouldTallyDomain(bucket string) (bool, []string) {
177 if ru.WarcproxMeta == nil {
178 return false, nil
179 }
180
181 for _, def := range ru.WarcproxMeta.StatsBucketDefs {
182 if def.Bucket == bucket && len(def.TallyDomains) > 0 {
183 return true, def.TallyDomains
184 }
185 }
186
187 return false, nil
188}