Hi All,
My name is Guru Prasad. I am new to jupyter notebook. I have got this code from my friend who helped me to give me this code, which is working but the speed of the code is very slow. Please help me if anyway I can speed up the code to get results faster. I tried everything from remote run to local run but no luck in speed.
Here is the code:
import requests
import csv
import re
import time
import os
from bs4 import BeautifulSoup as bs
from random import randint
results_path = “scripts_output/master/usa”
main_url = “https://www.mastersportal.com/countries/82/united-states.html”
Create results folder if it does not exist
if not os.path.exists(results_path):
os.makedirs(results_path)
Create results folder if it does not exist
if not os.path.exists(results_path):
os.makedirs(results_path)
helpers
def get_no_from_str(x):
result = “N/A”
s = re.search(r"[0-9]{1,5}([,.][0-9]{1,2})?", x)
if s:
result = s.group()
return result
get main url data
countries_res = requests.get(main_url)
universities_page = bs(countries_res.text, “lxml”)
Write csv output header
programs_headers = [
“university_name”, “study_name”, “deadline”, “tuition_amount”, “tuition_currency”, “tuition_price_specification”,
“language”, “Course_Link”, “Course_Description”, “city”, “structure”, “start_date”,
“duration”, “application_deadline”, “language”, “facts”, “ielts_score”, “cae_score”, “toefl_score”,
“University_Rank”, “academic_req”]
programs_file = open("{}/programs{}.csv".format(results_path, round(time.time())), “a”)
programs_csv = csv.DictWriter(programs_file, fieldnames=programs_headers)
programs_csv.writeheader()
counter = 1
for u in universities_page.select("#CountryStudies li a"):
university_url = u['href']
university_name = u['title']
university_id = re.search("/\d+/", university_url).group()[1:-1]
#university_country = country_id
print("++", university_name, "(" + university_id + ")")
studies_res = requests.get(university_url)
studies_page = bs(studies_res.text, "lxml")
for s in studies_page.select("#StudyListing .StudyInfo a"):
study_url = s['href']
study_name = s['title'].replace("/", "").replace(".", "")
program_name = study_name.split(',')[:-1][0].strip()
program_type = study_name.split(',')[-1].strip()
study_id = re.search("/\d+/", study_url).group()[1:-1]
study_university = university_id
#study_country = country_id
print("+++", counter, study_name, "(" + study_id + ")")
counter += 1
details_res = requests.get(study_url)
page_text = details_res.text.strip()
page = bs(page_text, "lxml")
name_s = "#Hero h1"
name = page.select(name_s)[0].text.strip() if page.select(name_s) else 'N/A'
deadline_s = "#js-StartdateContainer .Deadline time"
deadline = page.select(deadline_s)[0].text.strip() if page.select(deadline_s) else 'N/A'
tuition_amount_s = "#TuitionFeeContainer .Amount span"
tuition_amount = page.select(tuition_amount_s)[0]['data-amount'] if page.select(tuition_amount_s) else 'N/A'
tuition_currency_s = "#TuitionFeeContainer .Amount span.CurrencyType"
tuition_currency = page.select(tuition_currency_s)[0]['data-currency-text'] if page.select(tuition_currency_s) else 'N/A'
tuition_spec_s = "#TuitionFeeContainer .Amount span.CurrencyType"
tuition_price_specification = page.select(tuition_spec_s)[1].text.strip() if page.select(tuition_spec_s) else 'N/A'
duration_s = "#StudyKeyFacts .FactListSubListItem span.Duration"
duration = page.select(duration_s)[0].text.strip().replace(u'\xa0', u' ') if page.select(duration_s) else 'N/A'
language_s = "#StudyKeyFacts .FactItem.LanguageFact.js-languageFact .Languages"
language = page.select(language_s)[0].text.strip() if page.select(language_s) else 'N/A'
university_s = "#StudyDescription .StudyLink.TrackingExternalLink.ProgrammeWebsiteLink"
university = page.select(university_s)[0]['href'] if page.select(university_s) else 'N/A'
university_rank_s = "#StudyDescription"
university_rank_st = page.select(university_rank_s)[0].text.strip() if page.select(university_rank_s) else 'N/A'
Course_Description = (university_rank_st)
location_s = "#OrganisationInformation .js-locationInformationWrapper .LocationItems a"
city = page.select(location_s)[0].text.strip() if page.select(location_s) else 'N/A'
country = page.select(location_s)[1].text.strip() if page.select(location_s) else 'N/A'
structure_s = "#StudyContents ul li"
structure = [x.text.strip() for x in page.select(structure_s)] if page.select(structure_s) else 'N/A'
start_date_s = "#js-StartdateContainer .StartDateItemTime.js-deadlineFact time"
start_date = page.select(start_date_s)[0].text.strip() if page.select(start_date_s) else 'N/A'
application_deadline_s = "#js-StartdateContainer .Deadline time"
application_deadline = page.select(application_deadline_s)[0].text.strip() if page.select(application_deadline_s) else 'N/A'
facts_s = "#StudyKeyFacts"
#facts = page.select(facts_s)[0].text.strip() if page.select(facts_s) else 'N/A'
facts = [x.text.strip() for x in page.select(facts_s)] if page.select(facts_s) else 'N/A'
ielts_score_s = "#EnglishRequirements .CardContents.IELTSCard .Score span"
ielts_score_st = page.select(ielts_score_s)[0].text.strip() if page.select(ielts_score_s) else 'N/A'
ielts_score = get_no_from_str(ielts_score_st)
cae_score_s = "#AdmissionRequirements .Score"
cae_score_st = page.select(cae_score_s)[0].text.strip() if page.select(cae_score_s) else 'N/A'
cae_score = get_no_from_str(cae_score_st)
toefl_score_s = "#EnglishRequirements .CardContents.TOEFLCard .Score span"
toefl_score_st = page.select(toefl_score_s)[0].text.strip() if page.select(toefl_score_s) else 'N/A'
toefl_score = (toefl_score_st)
fees_s = "#js-worldRankingReadMoreButton"
tuition_fees_containers = [x.text.strip().replace(u'\xa0', u' ') for x in page.select(fees_s)] if page.select(fees_s) else 'N/A'
academic_req_s = "#OtherRequirements ul li"
academic_req = [x.text.strip() for x in page.select(academic_req_s)] if page.select(academic_req_s) else 'N/A'
new_row = {
#"country_url": country_url.replace(",", ""),
#"country_name": country_name.replace(",", ""),
#"country_id": country_id.replace(",", ""),
#"university_url": university_url.replace(",", ""),
"university_name": university_name.replace(",", ""),
#"university_id": university_id.replace(",", ""),
#"university_country": university_country.replace(",", ""),
#"study_url": study_url.replace(",", ""),
"study_name": study_name,
#"program_name": program_name,
#"program_type": program_type,
#"study_id": study_id.replace(",", ""),
#"name": name,
"deadline": deadline,
"tuition_amount": tuition_amount,
"tuition_currency": tuition_currency,
"tuition_price_specification": tuition_price_specification,
"duration": duration,
"language": language,
"Course_Link": university,
"Course_Description": Course_Description,
"city": city,
#"country": country,
"structure": structure,
"start_date": start_date,
"application_deadline": application_deadline,
"facts": facts,
"ielts_score": ielts_score,
"cae_score": cae_score,
"toefl_score": toefl_score,
"University_Rank": tuition_fees_containers,
"academic_req": academic_req
}
new_row = {k: str(v) for k, v in new_row.items()}
programs_csv.writerow(new_row)
programs_file.close()