KingsLandingJobScraper/scraper.py

83 lines
2.4 KiB
Python
Raw Permalink Normal View History

2023-01-24 15:01:58 -04:00
import time
import urllib.request
import urllib.error
2023-01-24 15:17:49 -04:00
import sys
2023-01-24 15:01:58 -04:00
from bs4 import BeautifulSoup
# Made for a friend with love
# Tested with Python 3.10.7 on Windows 10
2023-01-25 23:11:21 -04:00
# TODO: Look into using requests, urllib3, and/or selenium to scrape the page
# https://pypi.org/project/selenium/
# https://pypi.org/project/urllib3/
# https://pypi.org/project/requests/
2023-01-24 15:01:58 -04:00
def scrape(url="https://kingslanding.nb.ca/employment/", timeout=30):
    """Download the employment page and return the text of its content block.

    Args:
        url: Page to fetch. Defaults to the Kings Landing employment page.
        timeout: Seconds to wait on the network before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The text of the first ``<div>`` whose class is ``content-holder``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        ValueError: If the page has no ``content-holder`` div (for example
            because the site layout changed).
    """
    # Use the response as a context manager so the connection is closed
    # promptly; this function is called repeatedly by a long-running loop.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        contents = response.read()

    soup = BeautifulSoup(contents, "html.parser")

    # Other anchors that could be compared instead of "content-holder":
    # the <div id="wrapper"> element, or the <main id="main"> inside it.
    text_class = "content-holder"
    text_content = soup.find("div", {"class": text_class})
    if text_content is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on ``None.text``.
        raise ValueError(
            "Could not find a <div class=\"{}\"> on {}".format(text_class, url)
        )
    return text_content.text
DEFAULT_HOURS = 3


def parse_hours(argv, default=DEFAULT_HOURS):
    """Return the polling interval in hours taken from the command line.

    Args:
        argv: Argument list in ``sys.argv`` form; ``argv[0]`` is the script
            name and ``argv[1]``, if present, is the number of hours.
        default: Value used when no usable argument is supplied.

    Returns:
        A positive whole number of hours.
    """
    if len(argv) < 2:  # Only the script name: no arguments
        print(f"No arguments given, using default value of {default} hours")
        return default

    try:
        hours = int(argv[1])
    except ValueError:
        hours = 0  # Falls through to the "invalid" branch below

    # Zero would make the loop poll the site with no delay and a negative
    # value makes time.sleep() raise, so both are rejected like bad text.
    if hours <= 0:
        print(f"Invalid argument given, using default value of {default} hours")
        return default

    print(f"Using argument value of {hours} hours")
    return hours


def main(argv=None, testing=False):
    """Poll the employment page and alert when its text changes.

    The page is scraped once to get a baseline, then re-scraped every
    ``hours`` hours. When the text differs from the baseline the new text is
    printed and the user is asked whether to keep checking.

    Args:
        argv: Argument list in ``sys.argv`` form; defaults to ``sys.argv``.
        testing: When true, every successful scrape is reported as a new
            posting, which lets the alert path be exercised manually.

    An error during the initial baseline scrape is not caught here and
    propagates to the caller.
    """
    if argv is None:
        argv = sys.argv

    # 60 seconds * 60 minutes * hours
    timer = 60 * 60 * parse_hours(argv)

    text = scrape()
    print("Started checking for new jobs")

    while True:
        try:
            tmp = scrape()
        except urllib.error.URLError as err:
            print("An error occurred when scraping: {}".format(err.reason))
        except Exception as err:
            # Top-level boundary: report the problem and retry next cycle
            # rather than killing a watcher meant to run unattended.
            print("An unknown error has occurred: {}".format(err))
        else:
            if text != tmp or testing:
                print("\a")  # Terminal bell
                print("New job posting! Here's the text:")
                print(tmp)
                print("And here's the link: https://kingslanding.nb.ca/employment/")
                try:
                    choice = input("Do you want to keep checking? (y/n): ")
                except EOFError:
                    # No interactive stdin available: keep checking.
                    choice = ""
                if choice.strip().lower() == "n":
                    return
                # Only alert again when the page changes from this version.
                text = tmp

        # Sleep outside of any ``finally`` clause so that Ctrl-C during a
        # scrape or at the prompt stops the program immediately instead of
        # first starting a multi-hour sleep.
        time.sleep(timer)


if __name__ == '__main__':
    main()