marktplaats-scraper/utils.py at main · kami4ka/marktplaats-scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""Utility functions for Marktplaats scraper."""

import re
from typing import Optional, Tuple


def clean_text(text: str) -> str:
    """Collapse whitespace runs in ``text`` to single spaces and trim the ends.

    Args:
        text: Raw text; may be empty or ``None``.

    Returns:
        The normalized text, or an empty string for falsy input.
    """
    if text:
        # split() with no separator drops leading/trailing whitespace and
        # splits on any whitespace run, so the re-joined result is trimmed.
        words = text.split()
        return " ".join(words)
    return ""


def extract_price(text: str) -> str:
    """
    Pull a price indication out of listing text.

    A euro amount takes precedence; if none is present the text is checked
    for the keywords "bieden" (bidding) and "gratis" (free), in that order.

    Args:
        text: Text containing a price (e.g., "€ 99,95", "€ 1.250,00").

    Returns:
        "€ <amount>" for a euro amount, "Bieden" or "Gratis" for the
        keywords, or an empty string when nothing matches.
    """
    if not text:
        return ""

    # Digits optionally followed by "."/"," separated digit groups.
    euro_amount = re.search(r"€\s*(\d+(?:[.,]\d+)*)", text)
    if euro_amount is not None:
        return "€ " + euro_amount.group(1)

    lowered = text.lower()
    for keyword, label in (("bieden", "Bieden"), ("gratis", "Gratis")):
        if keyword in lowered:
            return label

    return ""


def extract_condition(text: str) -> str:
    """
    Extract the item condition from listing text.

    The text is searched case-insensitively for known condition phrases and
    the label of the first phrase found is returned.

    Args:
        text: Full text from listing.

    Returns:
        One of "Zo goed als nieuw", "Nieuw", "Gebruikt", "Refurbished",
        or an empty string if no condition phrase is present.
    """
    if not text:
        return ""

    text_lower = text.lower()

    # Order matters: "zo goed als nieuw" contains "nieuw", so the longer,
    # more specific phrase must be tested first or it can never be returned.
    conditions = (
        ("zo goed als nieuw", "Zo goed als nieuw"),
        ("nieuw", "Nieuw"),
        ("gebruikt", "Gebruikt"),
        ("refurbished", "Refurbished"),
    )

    for keyword, label in conditions:
        if keyword in text_lower:
            return label

    return ""


def extract_shipping(text: str) -> str:
    """
    Determine the delivery option mentioned in listing text.

    Matching is case-insensitive and the first phrase found wins.

    Args:
        text: Full text from listing.

    Returns:
        "Ophalen of Verzenden", "Verzenden", "Ophalen", or an empty string
        when none of the phrases occur.
    """
    if not text:
        return ""

    lowered = text.lower()

    # The combined phrase is listed first because it contains both of the
    # single-word options.
    options = (
        ("ophalen of verzenden", "Ophalen of Verzenden"),
        ("verzenden", "Verzenden"),
        ("ophalen", "Ophalen"),
    )
    for phrase, label in options:
        if phrase in lowered:
            return label

    return ""


def extract_date(text: str) -> str:
    """
    Find the posting date mentioned in listing text.

    The relative words "vandaag" (today) and "gisteren" (yesterday) are
    checked first, in that order; otherwise a day number followed by a
    Dutch month abbreviation (e.g. "12 jan", "5 feb") is looked for.

    Args:
        text: Full text from listing.

    Returns:
        "Vandaag", "Gisteren", "<day> <month abbreviation in lowercase>",
        or an empty string if no date is found.
    """
    if not text:
        return ""

    lowered = text.lower()
    for word, label in (("vandaag", "Vandaag"), ("gisteren", "Gisteren")):
        if word in lowered:
            return label

    # One or two digits, optional whitespace, then a month abbreviation.
    found = re.search(
        r"(\d{1,2})\s*(jan|feb|mrt|apr|mei|jun|jul|aug|sep|okt|nov|dec)",
        text,
        re.I,
    )
    if found is None:
        return ""

    day, month = found.groups()
    return f"{day} {month.lower()}"


def parse_listing_text(text: str, title: str = "") -> dict:
    """
    Parse listing card text into structured fields.

    Price, condition, shipping and posting date are extracted from the
    whitespace-normalized text by the dedicated ``extract_*`` helpers. A
    description is only attempted when both a title and a price are known.

    Args:
        text: Full text content from listing card.
        title: Listing title (per the original docs, taken from the card's
            h3 element).

    Returns:
        Dictionary with the keys "title", "price", "description",
        "location", "condition", "shipping", "seller_name" and
        "date_posted". "location" and "seller_name" are never filled in by
        this function and stay empty strings.
    """
    result = {
        "title": clean_text(title),
        "price": "",
        "description": "",
        "location": "",
        "condition": "",
        "shipping": "",
        "seller_name": "",
        "date_posted": "",
    }

    if not text:
        return result

    text = clean_text(text)

    result["price"] = extract_price(text)
    result["condition"] = extract_condition(text)
    result["shipping"] = extract_shipping(text)
    result["date_posted"] = extract_date(text)

    # The description is taken from between the title and the first euro
    # sign; when that span cannot be located, a "details" marker is tried.
    if result["title"] and result["price"]:
        desc = ""
        title_start = text.find(result["title"])
        price_start = text.find("€")

        # str.find returns -1 when the title is absent from the text. Adding
        # len(title) to that would produce a meaningless offset, so the
        # title/price span is only used when the title was actually found.
        title_found = title_start != -1
        title_end = title_start + len(result["title"]) if title_found else -1

        if title_found and price_start > title_end:
            desc = text[title_end:price_start].strip()
        else:
            # Look for "details" keyword which often precedes description;
            # capture 10-200 characters up to the next known field marker.
            details_match = re.search(
                r"details(.{10,200}?)(?:Nieuw|Gebruikt|Verzenden|Ophalen|€)",
                text,
                re.I,
            )
            if details_match:
                desc = details_match.group(1).strip()

        result["description"] = clean_text(desc)[:500]

    return result


def build_listing_url(base_url: str, path: str) -> str:
    """
    Build full listing URL from base URL and path.

    Any query string is dropped. Paths that are already absolute
    ``http://`` / ``https://`` URLs are returned as-is (minus the query
    string); everything else is joined onto ``base_url``.

    Args:
        base_url: Base Marktplaats URL (e.g., "https://www.marktplaats.nl")
        path: Item path (e.g., "/v/computers/a1234567-item")

    Returns:
        Full URL to the listing, or an empty string when ``path`` is empty.
    """
    if not path:
        return ""

    # Remove query parameters
    path = path.split("?")[0]

    # Require a real scheme prefix: a bare startswith("http") would also
    # match relative paths such as "http-kabels/a1-item" and return them
    # without the base URL.
    if path.lower().startswith(("http://", "https://")):
        return path

    if not path.startswith("/"):
        path = "/" + path
    return f"{base_url.rstrip('/')}{path}"


def build_category_url(base_url: str, category: str, page: int = 1) -> str:
    """
    Compose the URL of a category listing page.

    Args:
        base_url: Base Marktplaats URL.
        category: Category slug (e.g., "computers-en-software/windows-laptops");
            a leading slash is optional.
        page: Page number (1-indexed). Pages above 1 get a "p/<page>/" suffix.

    Returns:
        Full URL to the category page, or ``base_url`` unchanged when no
        category is given.
    """
    if not category:
        return base_url

    slug = category if category.startswith("/") else f"/{category}"
    root = base_url.rstrip("/")

    # The first page has no pagination segment.
    pagination = f"p/{page}/" if page > 1 else ""
    return f"{root}/l{slug}/{pagination}"


def extract_listing_id(url: str) -> Optional[str]:
    """
    Pull the numeric listing ID out of a listing URL.

    The ID is the run of digits in the first "/a<digits>-" segment of the
    URL, as in "/v/category/subcategory/a{id}-{slug}".

    Args:
        url: Listing URL (e.g., "/v/computers/a1234567-item-name")

    Returns:
        The ID digits as a string, or None if the URL is empty or has no
        such segment.
    """
    if url:
        found = re.search(r"/a(\d+)-", url)
        if found is not None:
            return found.group(1)
    return None


def format_price(price: str) -> str:
    """
    Normalize a price string to a consistent layout.

    Whitespace is normalized, a space is inserted between "€" and a digit
    that directly follows it, and anything from the word "details"
    (case-insensitive) to the end of the string is removed.

    Args:
        price: Price string (e.g., "€99,95", "€ 99,95")

    Returns:
        Consistently formatted price string, or an empty string for falsy
        input.
    """
    if price:
        tidy = clean_text(price)
        # Ensure space after €
        spaced = re.sub(r"€(\d)", r"€ \1", tidy)
        # Remove "details" suffix if present
        without_suffix = re.sub(r"details.*$", "", spaced, flags=re.I)
        return without_suffix.strip()
    return ""


def truncate_text(text: str, max_length: int = 200) -> str:
    """
    Truncate text to maximum length with ellipsis.

    Text that already fits is returned unchanged. Longer text is cut to
    leave room for "...", then trimmed back to the last space inside the
    cut (if any) so a word is not split, and the ellipsis is appended.

    Args:
        text: Text to truncate.
        max_length: Maximum length including ellipsis. When it is too small
            to hold the ellipsis (less than 3), the text is simply cut to
            ``max_length`` characters (nothing for values <= 0).

    Returns:
        Truncated text, never longer than ``max_length`` for non-negative
        limits. Falsy input is returned as-is.
    """
    if not text or len(text) <= max_length:
        return text

    ellipsis = "..."
    if max_length < len(ellipsis):
        # A negative slice bound would count from the end of the string and
        # yield a result longer than the limit; hard-cut instead.
        return text[: max(max_length, 0)]

    head = text[: max_length - len(ellipsis)]
    return head.rsplit(" ", 1)[0] + ellipsis