I wrote a script that scrapes data for a list of stocks. The data lives on two separate pages, so each stock symbol requires two requests. If I run the process on a list 1,000 items long, it takes around 30 minutes to complete. That's not horrible (I can set it and forget it), but I'm wondering if there is a way to speed up the process. Maybe store the data and write it all at the end instead of on each loop? Any other ideas appreciated.
import requests
from BeautifulSoup import BeautifulSoup
from progressbar import ProgressBar
import csv

symbols = {'AMBTQ','AABA','AAOI','AAPL','AAWC','ABEC','ABQQ','ACFN','ACIA','ACIW','ACLS'}
pbar = ProgressBar()

with open('industrials.csv', "ab") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['Symbol','5 Yr EPS','EPS TTM'])
    for s in pbar(symbols):
        try:
            url1 = 'https://research.tdameritrade.com/grid/public/research/stocks/fundamentals?symbol='
            full1 = url1 + s
            response1 = requests.get(full1)
            html1 = response1.content
            soup1 = BeautifulSoup(html1)
            for hist_div in soup1.find("div", {"data-module-name": "HistoricGrowthAndShareDetailModule"}):
                EPS5yr = hist_div.find('label').text
        except Exception as e:
            EPS5yr = 'Bad Data'
            pass
        try:
            url2 = 'https://research.tdameritrade.com/grid/public/research/stocks/summary?symbol='
            full2 = url2 + s
            response2 = requests.get(full2)
            html2 = response2.content
            soup2 = BeautifulSoup(html2)
            for div in soup2.find("div", {"data-module-name": "StockSummaryModule"}):
                EPSttm = div.findAll("dd")[11].text
        except Exception as e:
            EPSttm = "Bad data"
            pass
        writer.writerow([s,EPS5yr,EPSttm])
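One way to speed this up is to keep a single requests.Session alive (so connections are reused) and fetch symbols on a small thread pool, then write every row once at the end, as you suggested. A minimal sketch along those lines (Python 3; the fetch_eps helper, the worker count, and the short symbol list are illustrative, not part of the original script):

import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

symbols = ['AAPL', 'AAOI', 'ACIW']   # stand-in for the full symbol list above
session = requests.Session()         # one connection pool reused for every request

def fetch_eps(symbol):
    """Fetch both pages for one symbol and return a CSV row (illustrative helper)."""
    base = 'https://research.tdameritrade.com/grid/public/research/stocks/'
    row = [symbol, 'Bad Data', 'Bad Data']
    try:
        soup = BeautifulSoup(session.get(base + 'fundamentals?symbol=' + symbol).content, 'html.parser')
        hist = soup.find("div", {"data-module-name": "HistoricGrowthAndShareDetailModule"})
        row[1] = hist.find('label').text
    except Exception:
        pass
    try:
        soup = BeautifulSoup(session.get(base + 'summary?symbol=' + symbol).content, 'html.parser')
        summary = soup.find("div", {"data-module-name": "StockSummaryModule"})
        row[2] = summary.find_all("dd")[11].text
    except Exception:
        pass
    return row

with ThreadPoolExecutor(max_workers=10) as pool:   # worker count is a guess; keep it polite
    rows = list(pool.map(fetch_eps, symbols))

with open('industrials.csv', 'w', newline='') as csv_file:   # write everything once at the end
    writer = csv.writer(csv_file)
    writer.writerow(['Symbol', '5 Yr EPS', 'EPS TTM'])
    writer.writerows(rows)

Most of the 30 minutes is network latency, so overlapping the requests usually helps far more than deferring the CSV writes, though batching the writes is still tidier.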
I want to web scrape this webpage (www.autocar.co.uk). To do that, I want to select each car manufacturer in a drop-down menu and each model to get the href/reference to the model page, and then retrieve some information from each model page (not reflected in the code yet).
As I just started coding, I would highly appreciate your input! Thanks in advance!! :)
Desired output:
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Output as of now (the "https://www.autocar.co.uk0" lines still need to be removed):
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Code as of now:
from bs4 import BeautifulSoup
import requests
import pandas as pd
#Inputs/URLs to scrape:
url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text,'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            #Car Model
            car_model_url = (f'https://www.autocar.co.uk{item[0]}')
            print(car_model_url)
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))
You'll want to refactor things into a couple of functions for clarity; that also makes it easier to skip data that isn't valid (apparently occasionally you'd get a list from the ajax/car-models API):
from bs4 import BeautifulSoup
import requests
sess = requests.Session()
def get_make_info():
    resp = sess.get("http://www.autocar.co.uk/")
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    for option in soup.select('#edit-make option'):
        make_id = option['value']
        yield (make_id, option.text)

def get_make_models(make_id):
    info_url = f'https://www.autocar.co.uk/ajax/car-models/{make_id}/0'
    resp = sess.get(info_url)
    resp.raise_for_status()
    data = resp.json()
    options = data['options']
    if isinstance(options, list):  # Invalid format, skip
        return
    for model_url, model_name in options.items():
        if model_url == "0":  # "All models"
            continue
        model_url = f'https://www.autocar.co.uk{model_url}'
        yield (model_url, model_name)

for make_id, make_name in get_make_info():
    for model_url, model_name in get_make_models(make_id):
        print(make_id, make_name, model_url, model_name)
Using the code as written for your previous question, all you have to do is print out the 'Url' column of the dataframe:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text,'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            full_car_list.append((x[0], item[1], f'https://www.autocar.co.uk{item[0]}'))
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))

cars_df = pd.DataFrame(full_car_list[1:], columns = ['Make', 'Model', 'Url'])
cars_df = cars_df[cars_df.Model != 'All models']
cars_df.to_csv('makes_models.csv')

for x in cars_df.Url.tolist():
    print(x)
As of right now I have working code: a web scraper that logs into the Indeed job search site. My issue now is that I need to create a CSV file that lists every single job position that was found; currently it only gives me the number of positions available and the description of one of them. I hope I can get some help; I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime  # needed for datetime.today() in get_record()
from time import sleep
from random import randint
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = { "q": "mechanical+engineer", "l": "united+states", "start": 0 }
url = "https://www.indeed.com/jobs"
job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))
print(len(job_keys))  # number of unique job keys collected
template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
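Note that main() calls get_url(position, location), which isn't defined in the snippet above. A minimal sketch of that helper and of kicking the whole thing off, assuming the standard Indeed search URL format (the template below is an assumption, not taken from the original code):

def get_url(position, location):
    """Build the first search-results URL (hypothetical helper; URL format assumed)."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))

if __name__ == '__main__':
    # walks every results page via the 'Next' link and writes results.csv
    main('mechanical engineer', 'united states')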
I'm looking to parse a specific Yahoo stock page using a Python script (take https://finance.yahoo.com/quote/NOA?ltr=1 for example) and print the "Recommended Rating" to a file. The recommended rating can be found on the right-hand side of the page, about halfway down.
This is what I have so far
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from bs4 import BeautifulSoup
quote_page = 'https://finance.yahoo.com/quote/NOA?ltr=1'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, "html.parser")
name_box = soup.find(attrs={'div': 'rating-text Arrow South Fw(b) Bgc($strongBuy) Bdtc($strongBuy)'})
name = name_box.text.strip()
print(name)
The tricky part is that I believe the recommended rating is only listed on the page as inner HTML. I'm not sure how I'd go about retrieving this data; a push in the right direction would be greatly appreciated!
Yahoo makes a get request to the url in the script below for some of their data. If you look in the network tab of the developer tools and refresh the page for NOA stock you should see 'NOA?formatt...'. Click this and then view the response object to see some of the data. You'll need the requests module for the script below to work: pip install requests.
# get_mean_recs.py
import csv
from datetime import datetime
import requests
import sys
get_date = lambda: datetime.utcnow().strftime('%d-%m-%Y')

lhs_url = 'https://query2.finance.yahoo.com/v10/finance/quoteSummary/'
rhs_url = '?formatted=true&crumb=swg7qs5y9UP&lang=en-US&region=US&' \
          'modules=upgradeDowngradeHistory,recommendationTrend,' \
          'financialData,earningsHistory,earningsTrend,industryTrend&' \
          'corsDomain=finance.yahoo.com'

def get_mean_rec(ticker):
    url = lhs_url + ticker + rhs_url
    r = requests.get(url)
    if not r.ok:
        return -1
    result = r.json()['quoteSummary']['result'][0]
    return result['financialData']['recommendationMean']['fmt']

def read_from_csv(fn):
    with open(fn, 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            for ticker in line:
                yield ticker

def write_to_csv(fn, data):
    with open(fn, 'a') as f:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        for item in data:
            writer.writerow(item)

def assemble_dict(ticker):
    return {
        'ticker': ticker,
        'mean_rec': get_mean_rec(ticker),
        'utc_date': get_date()
    }

def main():
    in_fn = sys.argv[1]
    out_fn = sys.argv[2]
    data = [assemble_dict(ticker) for ticker in read_from_csv(in_fn)]
    write_to_csv(out_fn, data)

if __name__ == '__main__':
    main()
Usage:
python get_mean_recs.py input.csv output.csv
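Here input.csv just holds comma-separated tickers (a single line like NOA,AAPL,MSFT works, since read_from_csv yields every cell on every line), and output.csv gets one ticker/mean_rec/utc_date row appended per ticker.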
There is also an API for accessing Yahoo Finance information, e.g.
http://finance.yahoo.com/d/quotes.csv?s=NOA&f=snd1l1yr
I think you may be better off using that to fetch the required information. Some more info on the parameters can be found here:
http://wern-ancheta.com/blog/2015/04/05/getting-started-with-the-yahoo-finance-api/
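For example, a minimal sketch of hitting that CSV endpoint with requests (assuming the endpoint still responds; the snd1l1yr field string is the one from the URL above, and the linked post explains the individual field codes):

import csv
import io
import requests

# f=snd1l1yr selects a handful of fields for ticker NOA; see the linked post for the field codes
resp = requests.get('http://finance.yahoo.com/d/quotes.csv',
                    params={'s': 'NOA', 'f': 'snd1l1yr'})
resp.raise_for_status()

for row in csv.reader(io.StringIO(resp.text)):
    print(row)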
I have been struggling with this for over a week now. I am trying to learn Python and build something that will be useful to me at the same time: something to help me find a new home to rent.
I have all the code working the way I want it, except I cannot get all 550 properties; I can only get the first 25 on page 1. I have tried several methods but nothing seems to work.
If I use urlopen on the main URL plus the page-number suffix ("2_p/", for example), I get a URLError: unknown url type 'h'.
If I use webdriver, Firefox tries to look up www.h.com. I really could use a little help. Attached is my code; sorry, it is kind of messy and probably a little long. I am just learning, so don't be too cruel.
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re
pages1 = set()
next_page = ()
csv_output = [ ]
def getLinks(url):
    global pages
    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages1:
                newPage = addr_link.attrs['href']
                pages1.add(newPage)
                #print(newPage)
    print(len(pages1))
    for link in pages1:
        property_url = link
        getData(property_url)

def getData(url):
    base_url = 'http://www.zillow.com'
    final_url = base_url+url
    html = urlopen(final_url)
    bsObj = BeautifulSoup(html)
    try:
        # Gets property address
        address = bsObj.find("header", {"class":"zsg-content-header addr"}).find("h1")
        s_address = address.get_text()
        print(address)
        # Gets number of bedrooms
        beds = bsObj.find("span", {"class":"addr_bbs"})
        s_beds = beds.get_text()
        # Gets number of bathrooms
        baths = beds.find_next("span", {"class":"addr_bbs"})
        s_baths = baths.get_text()
        # Gets sqft
        sqft = baths.find_next("span", {"class":"addr_bbs"})
        s_sqft = sqft.get_text()
        # Gets rent_amount
        rent_amount = bsObj.find("div", {"class":"main-row home-summary-row"}).span
        s_rent_amount = rent_amount.get_text()
        # Gets rent_zestiment
        zestiment_holder = bsObj.find("span", {"class":"zsg-tooltip-launch zsg-tooltip-launch_keyword"})
        rent_zestiment = zestiment_holder.find_next("span")
        s_rent_zestiment = rent_zestiment.get_text()
        # Gets date posted on Zillow
        for time_posted in bsObj(text=re.compile("Posted")):
            posted = time_posted.parent.get_text()
            if 'hours' in posted:
                date_on_zillow = datetime.date.today()
                posted_date = date_on_zillow
            else:
                days_subtracted = int(re.search(r'\d+', posted).group())
                posted_date = datetime.date.today()-datetime.timedelta(days=+days_subtracted)
        # Gets property subdivision
        subdivision = bsObj.find(id="hdp-neighborhood").h2
        s_subdivision = subdivision.get_text()
        # Gets property_manager_name
        property_manager_name = bsObj.find("span", {"class":"snl company-name"})
        s_property_manager_name = property_manager_name.get_text()
        # Gets property_manager_phone
        property_manager_phone = bsObj.find("span", {"class":"snl phone"})
        s_property_manager_phone = property_manager_phone.get_text()
        # Gets disc_of_property
        disc_of_property = (bsObj.find('div', {'class': "notranslate"}).text)
        s_disc_of_property = disc_of_property.encode("utf-8")
        # Gets url_of_listing so I can see photos if interested
        main_url = 'http://www.zillow.com'
        url2 = url
        listing_url = main_url+url2
        s_listing_url = listing_url
    except AttributeError as e:
        return None
    csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
    csv_output.append(csv_data)
    resultFile = open("output.csv",'w+')
    wr = csv.writer(resultFile)
    wr.writerows(csv_output)
    resultFile.close()

header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
csv_output.append(header)

getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")
Edit:
The 'while' that has been commented out was my last attempt at handling the pagination.
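For the pagination itself, one approach is to build each page URL up front and hand the full URL to getLinks(), instead of concatenating pieces (the "unknown url type 'h'" error is typically what you get when a bare string ends up being treated as a sequence of characters). A minimal sketch, assuming the N_p/ page-number path segment you mentioned and the getLinks() function above; the page count is illustrative:

base_url = ("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/"
            "1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")

# 550 results at 25 per page is 22 pages; adjust as needed
for page in range(1, 23):
    page_url = base_url if page == 1 else "{}{}_p/".format(base_url, page)
    getLinks(page_url)

Since getLinks() opens a new Firefox instance on every call, you may also want to add driver.quit() inside it so 22 browser windows don't pile up.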
I'm trying to use multiple threads to go through a txt file of URLs and scrape the contents found at each URL. This works for about 20 URLs (how many isn't consistent), but then it consistently gets stuck on the last URL in the file. It doesn't seem to be processing them in order.
I have no idea why it's getting stuck or where to start, so thank you so much for your help.
from bs4 import BeautifulSoup, SoupStrainer
import urllib3
import urllib2
import io
import os
import re
import workerpool
from urllib2 import Request, urlopen, URLError
NUM_SOCKETS = 3
NUM_WORKERS = 5
urlfile = open("dailynewsurls.txt",'r') # read one line at a time until end of file
http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
workers = workerpool.WorkerPool(size=NUM_WORKERS)
class MyJob(workerpool.Job):
    def __init__(self, url):
        self.url = url
    def run(self):
        r = http.request('GET', self.url)
        req = urllib2.Request(url)
        try:
            page = urllib2.urlopen(req)
        except:
            print "had to skip one"
            return
        pagecontent = page.read() # get a file-like object at this url
        #this tells it to soup the page that is at the url above
        soup = BeautifulSoup(pagecontent)
        #this tells it to find the string in the first instance of each of the tags in the parenthesis
        title = soup.find_all('title')
        article = soup.find_all('article')
        try:
            title = str(title[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return
        try:
            article = str(article[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return
        try:
            # make the file using the things above
            output_files_pathname = 'DailyNews/' # path where output will go
            new_filename = title + ".txt"
            # write each of the things defined into the text file
            outfile = open(output_files_pathname + new_filename,'w')
            outfile.write(title)
            outfile.write("\n")
            outfile.write(article)
            outfile.close()
            print "%r added as a text file" % title
            return
        except:
            print "had to skip one"
            return
        return

for url in urlfile:
    workers.put(MyJob(url))

workers.shutdown()
workers.wait()

print "All done."
Here's an example list of the urls:
http://www.nydailynews.com/entertainment/tv-movies/x-factor-season-2-episode-2-recap-oops-britney-spears-article-1.1159546
http://www.nydailynews.com/new-york/brooklyn/lois-mclohon-resurfaced-iconic-daily-news-coney-island-cheesecake-photo-brings-back-memories-50-year-long-romance-article-1.1160457
http://www.nydailynews.com/new-york/uptown/espaillat-linares-rivals-bitter-history-battle-state-senate-seat-article-1.1157994
http://www.nydailynews.com/sports/baseball/mlb-power-rankings-yankees-split-orioles-tumble-rankings-nationals-shut-stephen-strasburg-hang-top-spot-article-1.1155953
http://www.nydailynews.com/news/national/salon-sell-internet-online-communities-article-1.1150614
http://www.nydailynews.com/sports/more-sports/jiyai-shin-wins-women-british-open-dominating-fashion-record-nine-shot-victory-article-1.1160894
http://www.nydailynews.com/entertainment/music-arts/justin-bieber-offered-hockey-contract-bakersfield-condors-minor-league-team-article-1.1157991
http://www.nydailynews.com/sports/baseball/yankees/umpire-blown-call-9th-inning-dooms-yankees-5-4-loss-baltimore-orioles-camden-yards-article-1.1155141
http://www.nydailynews.com/entertainment/gossip/kellie-pickler-shaving-head-support-best-friend-cancer-fight-hair-article-1.1160938
http://www.nydailynews.com/new-york/secret-103-000-settlement-staffers-accused-assemblyman-vito-lopez-sexual-harassment-included-penalty-20k-involved-talked-details-article-1.1157849
http://www.nydailynews.com/entertainment/tv-movies/ricki-lake-fun-adds-substance-new-syndicated-daytime-show-article-1.1153301
http://www.nydailynews.com/sports/college/matt-barkley-loyalty-usc-trojans-contention-bcs-national-championship-article-1.1152969
http://www.nydailynews.com/sports/daily-news-sports-photos-day-farewell-andy-roddick-world-1-u-s-open-champ-retires-loss-juan-martin-del-potro-article-1.1152827
http://www.nydailynews.com/entertainment/gossip/britney-spears-made-move-relationship-fiance-jason-trawick-reveals-article-1.1152722
http://www.nydailynews.com/new-york/brooklyn/brooklyn-lupus-center-tayumika-zurita-leads-local-battle-disease-difficult-adversary-article-1.1153494
http://www.nydailynews.com/life-style/fashion/kate-middleton-prabal-gurung-dress-sells-hour-myhabit-site-sold-1-995-dress-599-article-1.1161583
http://www.nydailynews.com/news/politics/obama-romney-campaigns-vie-advantage-president-maintains-lead-article-1.1161540
http://www.nydailynews.com/life-style/free-cheap-new-york-city-tuesday-sept-11-article-1.1155950
http://www.nydailynews.com/news/world/dozens-storm-embassy-compound-tunis-article-1.1159663
http://www.nydailynews.com/opinion/send-egypt-message-article-1.1157828
http://www.nydailynews.com/sports/more-sports/witnesses-feel-sheryl-crow-lance-amstrong-activities-article-1.1152899
http://www.nydailynews.com/sports/baseball/yankees/hiroki-kuroda-replacing-cc-sabathia-yankees-ace-pitcher-real-possibility-playoffs-looming-article-1.1161812
http://www.nydailynews.com/life-style/eats/finland-hosts-pop-down-restaurant-belly-earth-262-feet-underground-article-1.1151523
http://www.nydailynews.com/sports/more-sports/mighty-quinn-sept-23-article-1.1165584
http://www.nydailynews.com/sports/more-sports/jerry-king-lawler-stable-condition-suffering-heart-attack-wwe-raw-broadcast-monday-night-article-1.1156915
http://www.nydailynews.com/news/politics/ambassador-chris-stevens-breathing-libyans-found-american-consulate-rescue-article-1.1161454
http://www.nydailynews.com/news/crime/swiss-banker-bradley-birkenfeld-104-million-reward-irs-blowing-whistle-thousands-tax-dodgers-article-1.1156736
http://www.nydailynews.com/sports/hockey/nhl-board-governors-votes-favor-lockout-league-players-association-fail-reach-agreement-cba-article-1.1159131
http://www.nydailynews.com/news/national/iphone-5-works-t-network-article-1.1165543
http://www.nydailynews.com/sports/baseball/yankees/yankees-broadcasters-michael-kay-ken-singleton-opportunity-important-statement-article-1.1165479
http://www.nydailynews.com/news/national/boss-year-michigan-car-dealer-retires-employees-1-000-year-service-article-1.1156763
http://www.nydailynews.com/entertainment/tv-movies/hero-denzel-washington-clint-eastwood-article-1.1165538
http://www.nydailynews.com/sports/football/giants/ny-giants-secondary-roasted-tony-romo-dallas-cowboys-offense-article-1.1153055
http://www.nydailynews.com/news/national/hide-and-seek-tragedy-3-year-old-suffocates-hiding-bean-bag-article-1.1160138
I would try using the threading module; here is something I think is working:
from bs4 import BeautifulSoup, SoupStrainer
import threading
import urllib2
def fetch_url(url):
    urlHandler = urllib2.urlopen(url)
    html = urlHandler.read()
    #this tells it to soup the page that is at the url above
    soup = BeautifulSoup(html)
    #this tells it to find the string in the first instance of each of the tags in the parenthesis
    title = soup.find_all('title')
    article = soup.find_all('article')
    try:
        title = str(title[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad title\n"
        return
    try:
        article = str(article[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad article"
        return
    try:
        # make the file using the things above
        output_files_pathname = 'DailyNews/' # path where output will go
        new_filename = title + ".txt"
        # write each of the things defined into the text file
        outfile = open(output_files_pathname + new_filename, 'w')
        outfile.write(title)
        outfile.write("\n")
        outfile.write(article)
        outfile.close()
        print "%r added as a text file" % title
        return
    except:
        print "had to skip one cant write file"
        return
    return

with open("dailynewsurls.txt", 'r') as urlfile:
    # read one line at a time until end of file
    threads = [threading.Thread(target=fetch_url, args=(url,)) for url in urlfile]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
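The per-URL thread approach above starts every request at once; for a long URL file it can help to cap how many pages are fetched concurrently. A minimal sketch using a small thread pool (Python 2 to match the snippet above; it reuses the fetch_url function defined there and assumes the same dailynewsurls.txt file):

from multiprocessing.dummy import Pool  # thread pool from the standard library

with open("dailynewsurls.txt", 'r') as urlfile:
    urls = [line.strip() for line in urlfile if line.strip()]

pool = Pool(5)              # at most 5 pages are fetched at the same time
pool.map(fetch_url, urls)   # blocks until every URL has been processed
pool.close()
pool.join()

print "All done."

Stripping each line also avoids handing urlopen a URL with a trailing newline.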