crawler-buddy/crawleeplaywright.py at main · rumca-js/crawler-buddy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
I wanted to use crawlee directly in my project
 - I am running code in a thread (not a main thread)
 - crawlee uses asyncio, therefore I also have to use it
 - I tried creating my own loop in a thread, and on windows such system works
 - on linux raspberry it does not. Asyncio does not allow to define new loop from task
    set_wakeup_fd only works in main thread of the main interpreter

 - full asyncio http server also does not work, as only first request works, then crawlee
   complains that not all async tasks have been completed.
   No joke, asyncio http server has not completed, therefore it cannot work together

Therefore crawlee is called from a separate script. We cut off crawlee.
"""

import argparse
import sys
import os
from datetime import timedelta
import json
from src import webtools
import traceback
import shutil

from webtoolkit import (
   PageResponseObject, response_to_file,
   HTTP_STATUS_CODE_EXCEPTION,
)

os.environ["CRAWLEE_STORAGE_DIR"] = "./storage/{}".format(os.getpid())


def cleanup_storage():
    path = os.environ["CRAWLEE_STORAGE_DIR"]
    # cannot remove it yet, when program is running :(
    # shutil.rmtree(path)


crawlee_feataure_enabled = True
try:
    # https://github.com/apify/crawlee-python
    # https://crawlee.dev/python/api
    import asyncio
    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
except Exception as E:
    print(str(E))
    print("Make sure you have crawlee with playwright extra")
    crawlee_feataure_enabled = False


def on_close(interface, response):
    interface.response = response

    file = "response.txt"
    response_to_file(response, file)

    cleanup_storage()
    # crawlee complains if we kill it like this sys.exit(0)


async def main() -> None:
    webtools.WebConfig.use_print_logging()

    parser = webtools.ScriptCrawlerParser()
    parser.parse()
    if not parser.is_valid():
        sys.exit(1)
        return

    if not crawlee_feataure_enabled:
        print("Python: crawlee package is not available")
        sys.exit(1)
        return

    request = parser.get_request()
    if parser.args.verbose:
        print("Running request:{} with PlaywrightCrawler".format(request))

    interface = webtools.ScriptCrawlerInterface(
        parser, request, __file__, webtools.webconfig.CrawleePlaywrightScript(request.url).script
    )
    response = PageResponseObject(request.url)

    if parser.args.proxy_address:
        proxy_config = ProxyConfiguration(
            proxy_urls=[parser.args.proxy_address],
        )
        crawler = PlaywrightCrawler(
            proxy_configuration=proxy_config,
            # Limit the crawl to max requests. Remove or increase it for crawling all links.
            max_requests_per_crawl=10,
            request_handler_timeout=timedelta(seconds=request.timeout_s),
        )
    else:
        crawler = PlaywrightCrawler(
            # Limit the crawl to max requests. Remove or increase it for crawling all links.
            max_requests_per_crawl=10,
            request_handler_timeout=timedelta(seconds=request.timeout_s),
        )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        print(f"Processing {context.request.url} ...")

        try:
            # maybe we could send header information that we accept text/rss

            headers = {}
            for item in context.response.headers:
                headers[item] = context.response.headers[item]

            response.url = context.request.loaded_url
            response.request_url = request.url

            # result['loaded_url'] = context.page.url
            response.status_code = context.response.status
            response.set_headers(headers)

            interface.response = response

            if request.ping:
                on_close(interface, response)
                return

            if request.headers:
                on_close(interface, response)
                return

            if not interface.is_response_valid():
                on_close(interface, response)
                return

            response.set_text(await context.page.content())

            print(f"Processing {context.request.url} ...DONE")

            on_close(interface, response)
            return

        except Exception as E:
            print(str(E))
            error_text = traceback.format_exc()
            print(error_text)

            response.status_code = webtools.HTTP_STATUS_CODE_EXCEPTION
            on_close(interface, response)
            return

    try:
        # Run the crawler with the initial list of URLs.
        await crawler.run([parser.args.url])
    except Exception as E:
        print(str(E))
        error_text = traceback.format_exc()
        print(error_text)

        response.status_code = webtools.HTTP_STATUS_CODE_EXCEPTION
        on_close(interface, response)
        return


if __name__ == "__main__":
    asyncio.run(main())