KingsLandingJobScraper/scraper.py

import time
import urllib.request
import urllib.error
import sys

from bs4 import BeautifulSoup


# Made for a friend with love
# Tested with Python 3.10.7 on Windows 10

# TODO: Look into using requests, urllib3, and/or selenium to scrape the page
# https://pypi.org/project/selenium/
# https://pypi.org/project/urllib3/
# https://pypi.org/project/requests/

def scrape(url="https://kingslanding.nb.ca/employment/", timeout=30):
    """Download the employment page and return the text of its content area.

    Args:
        url: Address of the page to fetch. Defaults to the Kings Landing
            employment page, so existing ``scrape()`` calls keep working.
        timeout: Seconds to wait on the network before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The text of the first ``<div>`` on the page whose class is
        ``content-holder``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        ValueError: If the page contains no ``content-holder`` div.
    """
    # The context manager closes the HTTP response (and its socket) as soon
    # as the body has been read, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        contents = response.read()

    soup = BeautifulSoup(contents, "html.parser")

    # Only this div is returned for comparison; the enclosing "wrapper" div
    # and "main" element were earlier candidates for the same purpose.
    text_class = "content-holder"
    text_content = soup.find("div", {"class": text_class})
    if text_content is None:
        # Fail with a message that names the real problem rather than an
        # AttributeError on None.
        raise ValueError(
            "No <div class=\"{}\"> found at {}".format(text_class, url)
        )

    return text_content.text


def parse_hours(argv, default=3):
    """Return the polling interval in hours taken from the command line.

    Args:
        argv: Argument list in ``sys.argv`` form; ``argv[0]`` is the script
            name and ``argv[1]``, when present, is the requested interval.
        default: Interval used when no usable value is supplied.

    Returns:
        A positive whole number of hours. Missing, non-numeric, zero and
        negative values all fall back to ``default``; zero would poll the
        site in a tight loop and a negative value makes ``time.sleep`` raise.
    """
    if len(argv) < 2:  # Only the script name (or nothing at all) was given
        print("No arguments given, using default value of {} hours".format(default))
        return default

    try:
        hours = int(argv[1])
    except ValueError:
        hours = None

    if hours is None or hours <= 0:
        print("Invalid argument given, using default value of {} hours".format(default))
        return default

    print("Using argument value of " + str(hours) + " hours")
    return hours


def _keep_checking():
    """Ask whether to continue polling; return False only on an explicit no."""
    print("Do you want to keep checking? (y/n): ", end="")
    try:
        choice = input()
    except EOFError:
        # No interactive stdin is available, so nobody can answer: keep going.
        return True
    return choice.strip().lower() != "n"


def main():
    """Poll the employment page and alert when its content changes."""
    # Set to True to force the "new posting" path on every check.
    testing = False

    hours = parse_hours(sys.argv)

    # 60 seconds * 60 minutes * number of hours
    timer = 60 * 60 * hours

    # Without a baseline there is nothing to compare against, so a network
    # failure here ends the program with a readable message.
    try:
        text = scrape()
    except urllib.error.URLError as err:
        print("An error occurred when scraping: {}".format(err.reason))
        sys.exit(1)
    print("Started checking for new jobs")

    while True:
        try:
            tmp = scrape()
        except urllib.error.URLError as err:
            print("An error occurred when scraping: {}".format(err.reason))
        except Exception as err:
            # Top-level boundary: report and retry on the next cycle rather
            # than letting one bad fetch stop a long-running watcher.
            print("An unknown error has occurred: {}".format(err))
        else:
            if text != tmp or testing:
                print("\a")  # Console bell
                print("New job posting! Here's the text:")
                print(tmp)
                print("And here's the link: https://kingslanding.nb.ca/employment/")
                if not _keep_checking():
                    sys.exit(0)
                text = tmp

        # Sleeping here, not in a ``finally`` clause, lets Ctrl+C during a
        # scrape or the prompt stop the program immediately instead of
        # first waiting out the whole interval.
        time.sleep(timer)


if __name__ == '__main__':
    main()
Initial commit 2023-01-24 15:01:58 -04:00			`import time`
			`import urllib.request`
			`import urllib.error`
Add way to change hour interval 2023-01-24 15:17:49 -04:00			`import sys`
Initial commit 2023-01-24 15:01:58 -04:00
			`from bs4 import BeautifulSoup`


			`# Made for a friend with love`
			`# Tested with Python 3.10.7 on Windows 10`

Add comments about improvements 2023-01-25 23:11:21 -04:00			`# TODO: Look into using requests, urllib3, and/or selenium to scrape the page`
			`# https://pypi.org/project/selenium/`
			`# https://pypi.org/project/urllib3/`
			`# https://pypi.org/project/requests/`

Initial commit 2023-01-24 15:01:58 -04:00			`def scrape():`
			`contents = urllib.request.urlopen("https://kingslanding.nb.ca/employment/").read()`
			`soup = BeautifulSoup(contents, "html.parser")`

			`# Other ways to parse the HTML to check for differences`

			`# div_id = "wrapper"`
			`# whole_page = soup.find("div", {"id": div_id})`

			`# main_id = "main"`
			`# main_content = whole_page.find("main", {"id": main_id})`

			`text_id = "content-holder"`
			`text_content = soup.find("div", {"class": text_id})`

			`return text_content.text`


			`if __name__ == '__main__':`
Change testing to false 2023-01-31 19:55:09 -04:00			`testing = False`
Adds a way to test, and console bell Also fixes exiting the program when done checking 2023-01-24 15:39:53 -04:00			`will_exit = False`

Add way to change hour interval 2023-01-24 15:17:49 -04:00			`hours = 3`

			`arg_count = len(sys.argv)`
Adds a way to test, and console bell Also fixes exiting the program when done checking 2023-01-24 15:39:53 -04:00			`if arg_count == 1: # No arguments`
Add way to change hour interval 2023-01-24 15:17:49 -04:00			`print("No arguments given, using default value of 3 hours")`
			`else:`
			`try:`
			`hours = int(sys.argv[1]) # First argument is the script name, second is the first argument`
			`print("Using argument value of " + str(hours) + " hours")`
			`except ValueError:`
			`print("Invalid argument given, using default value of 3 hours")`

Initial commit 2023-01-24 15:01:58 -04:00			`# start = time.time()`

			`# 60 seconds * 60 minutes * 3 hours`
Add way to change hour interval 2023-01-24 15:17:49 -04:00			`timer = 60 * 60 * hours`
Initial commit 2023-01-24 15:01:58 -04:00			`text = scrape()`
Add way to change hour interval 2023-01-24 15:17:49 -04:00			`print("Started checking for new jobs")`
Initial commit 2023-01-24 15:01:58 -04:00
			`while True:`
			`try:`
			`tmp = scrape()`
Adds a way to test, and console bell Also fixes exiting the program when done checking 2023-01-24 15:39:53 -04:00			`if text != tmp or testing:`
			`print("\a")`
Initial commit 2023-01-24 15:01:58 -04:00			`print("New job posting! Here's the text:")`
			`print(tmp)`
			`print("And here's the link: https://kingslanding.nb.ca/employment/")`
			`print("Do you want to keep checking? (y/n): ", end="")`
			`choice = input()`
			`if choice == "n" or choice == "N":`
Adds a way to test, and console bell Also fixes exiting the program when done checking 2023-01-24 15:39:53 -04:00			`will_exit = True`
Initial commit 2023-01-24 15:01:58 -04:00			`else:`
			`text = tmp`
			`except urllib.error.URLError as err:`
			`print("An error occurred when scraping: {}".format(err.reason))`

			`except Exception as err:`
			`print("An unknown error has occurred: {}".format(err))`

			`finally:`
Adds a way to test, and console bell Also fixes exiting the program when done checking 2023-01-24 15:39:53 -04:00			`if will_exit:`
			`exit(0)`
			`else:`
			`time.sleep(timer)`