-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
137 lines (109 loc) · 3.53 KB
/
Copy pathmodels.py
File metadata and controls
137 lines (109 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""Data models for Marktplaats scraper."""
import csv
import json
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional
@dataclass
class Listing:
    """A single Marktplaats listing.

    Identity is defined by ``listing_url`` alone: two listings with the same
    URL compare equal and hash identically, whatever their other fields
    contain. Because the hash depends on a mutable field, ``listing_url``
    should not be reassigned while the instance is stored in a set or used
    as a dict key.

    All fields are plain strings; the optional ones default to ``""``.
    """

    title: str
    price: str
    description: str
    location: str
    listing_url: str
    category: str
    condition: str = ""
    shipping: str = ""
    seller_name: str = ""
    date_posted: str = ""
    # ISO 8601 timestamp in UTC, captured when the instance is created.
    scraped_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_dict(self) -> dict:
        """Return the listing's fields as a dict keyed by field name."""
        return asdict(self)

    def __hash__(self) -> int:
        """Hash on the listing URL so it stays consistent with ``__eq__``."""
        return hash(self.listing_url)

    def __eq__(self, other: object) -> bool:
        """Compare by listing URL.

        Returns ``NotImplemented`` for operands that are not ``Listing``
        instances so Python can try the other operand's comparison before
        falling back to identity; ``listing == "text"`` still evaluates to
        ``False``.
        """
        if not isinstance(other, Listing):
            return NotImplemented
        return self.listing_url == other.listing_url
class ListingCollection:
    """Collection of listings, deduplicated by URL, with CSV/JSON export.

    Listings are kept in insertion order. When a listing with an
    already-known ``listing_url`` is added, the first one stored is kept
    and the newcomer is ignored.
    """

    # Column order used by ``to_csv``.
    _CSV_FIELDS = (
        "title",
        "price",
        "description",
        "location",
        "listing_url",
        "category",
        "condition",
        "shipping",
        "seller_name",
        "date_posted",
        "scraped_at",
    )

    def __init__(self) -> None:
        """Create an empty collection."""
        # Maps listing_url -> listing; the dict keeps insertion order.
        self._listings: dict[str, Listing] = {}

    def add(self, listing: "Listing") -> bool:
        """Add a listing unless one with the same URL is already stored.

        Args:
            listing: The listing to store.

        Returns:
            True if the listing was new and has been added, False if it was
            a duplicate and has been ignored.
        """
        if listing.listing_url in self._listings:
            return False
        self._listings[listing.listing_url] = listing
        return True

    def add_many(self, listings: "list[Listing]") -> int:
        """Add several listings, skipping duplicates.

        Args:
            listings: Listings to store.

        Returns:
            The number of listings that were actually added.
        """
        return sum(1 for listing in listings if self.add(listing))

    @property
    def listings(self) -> "list[Listing]":
        """All stored listings as a new list, in insertion order."""
        return list(self._listings.values())

    def __len__(self) -> int:
        """Return the number of stored listings."""
        return len(self._listings)

    def __iter__(self):
        """Iterate over the stored listings in insertion order."""
        return iter(self._listings.values())

    def to_csv(self, filepath: str) -> str:
        """Export the listings to a CSV file.

        The file is always written, even for an empty collection (it then
        contains only the header row). This keeps the returned path valid
        and avoids leaving a stale file from an earlier run in place.

        Args:
            filepath: Path to the output CSV file.

        Returns:
            The filepath where data was saved.
        """
        # newline="" lets the csv module control line endings itself.
        with open(filepath, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self._CSV_FIELDS)
            writer.writeheader()
            writer.writerows(
                listing.to_dict() for listing in self._listings.values()
            )
        return filepath

    def to_json(self, filepath: str) -> str:
        """Export the listings to a JSON file.

        The document holds the export timestamp (UTC, ISO 8601), the number
        of listings, and the listings themselves.

        Args:
            filepath: Path to the output JSON file.

        Returns:
            The filepath where data was saved.
        """
        data = {
            "scraped_at": datetime.now(timezone.utc).isoformat(),
            "total_listings": len(self._listings),
            "listings": [
                listing.to_dict() for listing in self._listings.values()
            ],
        }
        with open(filepath, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            json.dump(data, f, indent=2, ensure_ascii=False)
        return filepath