-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_getter.py
70 lines (57 loc) · 2.54 KB
/
data_getter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
from dotenv import load_dotenv
def generate_wordcloud(df, column=None):
    """Display a word cloud built from one text column of a DataFrame.

    Args:
        df: pandas DataFrame containing the text to visualise.
        column: Name of the column to read. When omitted, ``'column_name'``
            is used if the frame has it (the name this function was
            originally hard-coded to), otherwise ``'job_description'``,
            which is the column produced by ``scrape_rit_jobs`` in this file.

    Raises:
        KeyError: If the selected column is not present in ``df``.
        ValueError: If the selected column holds no text to plot.
    """
    if column is None:
        # Keep honouring the old placeholder name, but fall back to the
        # column the scraper in this file actually produces.
        column = 'column_name' if 'column_name' in df.columns else 'job_description'
    if column not in df.columns:
        raise KeyError(f"column {column!r} not found in DataFrame")

    # Drop missing values first; otherwise astype(str) turns them into the
    # literal word "nan", which would then show up in the cloud.
    text = ' '.join(df[column].dropna().astype(str).tolist())
    if not text.strip():
        raise ValueError(f"column {column!r} contains no text to build a word cloud from")

    wordcloud = WordCloud(width=800, height=400).generate(text)
    plt.figure(figsize=(15, 7.5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def login_rit_site(driver, username, password):
    """Log the given Selenium driver in through the RIT login form.

    The login page address is read from the ``LOGIN_URL`` environment
    variable. The function fills the ``ritUsername`` and ``ritPassword``
    fields, submits with RETURN, and then pauses so a second-factor prompt
    can be completed by hand.

    Args:
        driver: Selenium WebDriver instance to drive.
        username: Text typed into the username field.
        password: Text typed into the password field.

    Raises:
        ValueError: If ``LOGIN_URL`` is unset or empty.
        selenium.common.exceptions.TimeoutException: If the username field
            does not become usable within the wait timeout.
    """
    login_url = os.getenv('LOGIN_URL')
    if not login_url:
        # Fail with a clear message instead of handing None to driver.get().
        raise ValueError("LOGIN_URL environment variable is not set")

    # Imported locally so the block does not depend on edits to the
    # file-level import list.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get(login_url)

    # Wait until the form is actually usable rather than sleeping a fixed
    # five seconds: returns as soon as the field is ready and tolerates
    # slower page loads.
    username_field = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.ID, "ritUsername"))
    )
    username_field.send_keys(username)

    password_field = driver.find_element(By.ID, "ritPassword")
    password_field.send_keys(password)
    password_field.send_keys(Keys.RETURN)

    # Fixed pause kept from the original: gives the user time to complete
    # 2FA. NOTE(review): there is no visible post-login element to wait on
    # here; replace with an explicit wait once one is known.
    time.sleep(10)
def scrape_rit_jobs(query, num_pages=5, username='', password=''):
    """Collect job-description text from the RIT Symplicity job search.

    Starts a Chrome driver, logs in via ``login_rit_site``, then loads
    ``num_pages`` search-result pages (one posting per page, ``perPage=1``)
    and extracts the text of the first ``div`` with class
    ``form-col no-padding`` from each.

    Args:
        query: Search keywords, e.g. ``"Software Engineer"``.
        num_pages: Number of result pages to visit.
        username: Login name passed to ``login_rit_site``.
        password: Password passed to ``login_rit_site``.

    Returns:
        pandas DataFrame with a single ``job_description`` column, one row
        per page on which a description element was found.
    """
    # Imported locally so the block does not depend on edits to the
    # file-level import list.
    from urllib.parse import quote

    # The URL this function originally hard-coded carried the keywords
    # percent-encoded twice ("%2520" for a space); keep that wire format
    # while honouring the caller's query.
    # NOTE(review): assumes the site treats keywords case-insensitively,
    # since the original literal was lower-case — confirm.
    keywords = quote(quote(query, safe=''), safe='')

    job_descriptions = []
    driver = webdriver.Chrome()
    try:
        login_rit_site(driver, username, password)
        # NOTE(review): paging starts at page=0 as in the original; confirm
        # whether the site numbers pages from 0 or 1.
        for page in range(num_pages):
            url = (
                "https://rit-csm.symplicity.com/students/app/jobs/search"
                f"?perPage=1&page={page}&sort=!kwmatch&keywords={keywords}"
                "&currentJobId=9044d386ecc1f2f122daa296774c22e5"
            )
            driver.get(url)
            time.sleep(5)  # Give the client-side app time to render the posting.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            desc = soup.find('div', class_='form-col no-padding')
            if desc:
                job_descriptions.append(desc.get_text(separator=' ', strip=True))
    finally:
        # Always close the browser, even if login or a page load fails.
        driver.quit()

    return pd.DataFrame({'job_description': job_descriptions})
def main():
    """Scrape "Software Engineer" postings and save them to ``rit_jobs2.csv``.

    Credentials are read from the ``KUSERNAME`` and ``PASSWORD`` environment
    variables after loading a ``.env`` file via ``load_dotenv()``.

    Raises:
        RuntimeError: If either credential variable is unset or empty.
    """
    load_dotenv()
    username = os.getenv('KUSERNAME')
    password = os.getenv('PASSWORD')

    # Fail before launching a browser if credentials are missing, instead of
    # passing None down to the login form.
    missing = [
        name
        for name, value in (('KUSERNAME', username), ('PASSWORD', password))
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    df = scrape_rit_jobs("Software Engineer", username=username, password=password, num_pages=5)
    # Persist the results; the row index is written as the first CSV column.
    df.to_csv('rit_jobs2.csv', index=True)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()