-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathweb_scraping.py
More file actions
45 lines (38 loc) · 1.19 KB
/
Copy pathweb_scraping.py
File metadata and controls
45 lines (38 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
This script scrapes data from the web page
https://noonies.tech/award/top-programming-guru
The collected data is stored as a table of three columns:
the channel name, the channel URL, and the rank.
"""
import re
import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
# Scrape every YouTube link from the award page with headless Chrome and save
# the link text, the link target and a 1-based rank to a CSV file.

# Run Chrome without opening a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")

url = 'https://noonies.tech/award/top-programming-guru'
path = '/home/zaki/Downloads/chromedriver_linux64/chromedriver'  # path to chromedriver
output_path = Path('data/top-programming-guru.csv')

# NOTE(review): passing the driver path positionally is the Selenium 3 call
# form; newer Selenium 4 releases expect a Service object instead -- confirm
# against the installed Selenium version.
driver = webdriver.Chrome(path, options=chrome_options)
try:
    driver.get(url)
    # Fixed pause before reading the DOM; presumably this gives the page's
    # JavaScript time to render the list -- confirm 5 s is sufficient.
    time.sleep(5)
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails, so no
    # headless Chrome / chromedriver process is left running.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")
# Every tag whose href contains "youtube.com"; the dot is escaped so it only
# matches a literal '.'.
results = soup.find_all(href=re.compile(r'youtube\.com'))

# Column-oriented accumulator: one list per output column.
data = {'channelName': [], 'url': []}
for c in tqdm(results):
    data['channelName'].append(c.text)
    data['url'].append(c.get('href'))

df = pd.DataFrame.from_dict(data)

# The rank is the 1-based position of the link in the order find_all returned
# it (assumes the page lists channels in rank order -- TODO confirm).
ranking = df.index + 1
df['rank'] = ranking
print(df.head())

# Create the output directory if needed; to_csv does not create it.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)