How do I create a CSV file with webscraped content from several URLs? - python
I want to create a CSV file from webscraped content. The content is from FinViz.com
I want to scrape the table from this website 20 times, once for each of 20 different stocks, and put all the content into a CSV file. Within my code, I generate a list of stocks from a scrape of Twitter content. That generated list is the same list of stocks I want to pull information on from the FinViz.com tables.
Here is my code:
import csv
import urllib.request
from bs4 import BeautifulSoup
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
for url in url_list:
    fpage = urllib.request.urlopen(url)
    fsoup = BeautifulSoup(fpage, 'html.parser')

    # scrape single page and add data to list
    # write datalist
    with open('today.csv', 'a') as file:
        writer = csv.writer(file)
        # write header row
        writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
        # write body row
        writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
The trouble I am running into is that my CSV file only contains the scraped data for the last item in the list. Instead, I want a row for every item in the list.
Here is what my CSV file looks like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-1.75,7.94%,79.06M,-22.52%,296.48M,-,-1.74,-4.61%,72.41M,-23.16%,-85.70M,-,-0.36,62.00%,3.21%,1.63%,15.10M,19.63,-197.00%,18.05%,2.57,66.67%,-0.65,-,-8.10%,-127.70%,12.17,-6.25%,0.93,4.03,-,146.70%,2.05 - 5.86,3.59%,-,-,-,385.80%,-36.01%,-,-,1.30,-,76.50%,82.93%,0.41,100,1.30,-59.60%,-,36.98,16.13% 9.32%,Yes,-,90.00%,-,0.82,3.63,Yes,-,Nov 08,-,902.43K,3.75,2.30,-22.08%,-10.43%,11.96%,"742,414",3.31%
It would be better to open your output file once, rather than repeatedly opening and closing it for each URL that you fetch. Exception handling is also needed to catch cases where a URL does not exist.
Also, open the output file with newline='' to avoid extra blank lines being written to it:
import csv
import urllib.request
from bs4 import BeautifulSoup
write_header = True
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
with open('today.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')

            # write header row (once)
            if write_header:
                writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
                write_header = False

            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except urllib.error.HTTPError:
            print("{} - not found".format(url))
So today.csv would start like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-10.85,4.60%,2.36M,11.00%,8.09M,-,-,-62.38%,1.95M,-16.14%,-14.90M,-,-,2.30%,10.00%,-44.42%,0.00M,-,21.80%,-5.24%,3.10,-38.16%,1.46,2.35,-,-155.10%,65.00,-50.47%,-,-,-,-238.40%,2.91 - 11.20,-38.29%,-,-,54.50%,-,-69.37%,1.63,-,2.20,-,-,17.87%,0.36,15,2.20,-,-,39.83,11.38% 10.28%,No,0.00,68.70%,-,1.48,3.30,Yes,0.00,Feb 28 AMC,-,62.76K,3.43,1.00,-5.21%,-25.44%,-37.33%,"93,166",3.94%
-,-,-0.26,1.50%,268.98M,3.72%,2.25B,38.05,0.22,-0.64%,263.68M,-9.12%,-55.50M,-,0.05,-,9.96%,-12.26%,1.06B,2.12,-328.10%,25.95%,2.32,17.72%,12.61,0.66,650.00%,-0.90%,12.64,-38.73%,0.03,264.87,-,-1.90%,6.69 - 15.27,-0.48%,-,-,-28.70%,0.00%,-45.17%,2.20,-,0.70,16.40%,67.80%,25.11%,0.41,477,0.80,71.90%,5.30%,52.71,4.83% 5.00%,Yes,0.80,7.80%,-5.20%,0.96,7.78,Yes,0.80,Feb 27 AMC,-,11.31M,8.37,2.20,0.99%,-1.63%,-4.72%,"10,843,026",7.58%
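One optional tweak, not part of the original answer: prepend the ticker to each body row so you can tell which stock a row belongs to. A sketch of the loop with that change (it assumes writer, tweets and url_base from the code above; the header row would then also need a leading "Ticker" cell):
# Drop-in replacement for the for-loop above; each row now starts with its ticker.
for tckr in tweets:
    url = url_base + tckr
    try:
        fpage = urllib.request.urlopen(url)
        fsoup = BeautifulSoup(fpage, 'html.parser')
        body = [e.text for e in fsoup.find_all('td', {'class': 'snapshot-td2'})]
        writer.writerow([tckr] + body)
    except urllib.error.HTTPError:
        print("{} - not found".format(url))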
If you only want your file to contain data from one run of the script, you do not need a to append, just use w instead.
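If, on the other hand, you do want rows to accumulate across several runs, a minimal sketch of the usual pattern (this is an addition, not part of the answer above) is to keep append mode but write the header only when the file does not exist yet:
import csv
import os.path

# Placeholder cells standing in for the scraped header/body rows above.
header_cells = ['Index', 'P/E', 'EPS (ttm)']
body_cells = ['-', '-', '-1.75']

csv_path = 'today.csv'
need_header = not os.path.exists(csv_path)   # header only on the first run

with open(csv_path, 'a', newline='') as f:
    writer = csv.writer(f)
    if need_header:
        writer.writerow(header_cells)
    writer.writerow(body_cells)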
Related
Only print on the first column of csv
So I have this code, but I am having issues when the data I am scraping contains commas. I want the data to appear only in the first column, but when there's a comma it spills into the second column. Is it possible to scrape and print it only in the first column of the CSV without using pandas? Thanks
i = 1
for url in urls:
    print(f'Scraping the URL no {i}')
    i += 1
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for text in soup.find('div', class_='entry-content').find_all('div', class_='streak'):
        link = text.a['href']
        text = text.a.text
        links.append(link)
        with open("/Users/Rex/Desktop/data.csv", "a") as file_object:
            file_object.write(text)
            file_object.write("\n")
CSV files have rules for escaping commas within a single column so that they are not mistakenly interpreted as a new column. This escaping can be applied automatically if you use the csv module. You really only need to open the file once, so with a few more tweaks to your code:
import csv

with open("/Users/Rex/Desktop/data.csv", "a", newline=None) as file_object:
    csv_object = csv.writer(file_object)
    i = 1
    for url in urls:
        print(f'Scraping the URL no {i}')
        i += 1
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for text in soup.find('div', class_='entry-content').find_all('div', class_='streak'):
            link = text.a['href']
            text = text.a.text.strip()
            # only record if we have text
            if text:
                links.append(link)
                csv_object.writerow([text])
NOTE: This code is skipping links that do not have text.
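To see the escaping in action, here is a tiny illustration with made-up values (not from the scrape): the field containing a comma is quoted automatically, so it stays in one column when read back.
import csv
import io

# Illustration only: the second field contains a comma, so csv.writer
# wraps it in quotes and it remains a single column.
rows = [['First streak'], ['Second, longer streak']]

buf = io.StringIO()
csv.writer(buf).writerows(rows)

print(buf.getvalue())
# First streak
# "Second, longer streak"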
Iterating over URLs loaded from a CSV in Python
Please help me. I have URLs in a CSV file; there are 100 rows and 1 column, and I want to load the data from line 1 to line 100 using Python. However, when I run it, the loop only works once, for one of the lines; it does not reach the end of the URLs in the CSV and does not continue to the next URL. I also hit this error when the element is not found in the HTML:
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
How do I get past this when an error occurs because the HTML is not found? Here is the Python code I am using; please help so that the scraping loop runs to the end of the URL list:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas

with open('Url Torch.csv', 'rt') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        URL_GO = row[2]

def variable_Scrape(url):
    try:
        cookies = dict(cookie="............")
        request = requests.get(url, cookies=cookies)
        html = BeautifulSoup(request.content, 'html.parser')
        title = html.find('div', class_='title').text.strip().strip('\n')
        desc = html.find('div', class_='content').text
        link = html.find_all('img', class_='lazyload slide-item owl-lazy')
        normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
        disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
    except AttributeError as e:
        print(e)
        # ConnectionAbortedError
        return False
    else:
        print(title)
        # print(desc)
        # print(link)
    finally:
        print(title)
        print(desc)
        print(link)
        print('Finally.....')

variable_Scrape(URL_GO)
It is hard to give you the exact answer without seeing your CSV file, but try this:
import csv

f = open('your_file.csv')
csv_f = csv.reader(f)

for row in csv_f:
    print(row[0])
This is the code:
import csv

data = []  # create an empty list to store rows in
with open('emails.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        data.append(row)  # add each row to the list

Based on your comments about moving on when a URL is not OK:
for url in data:  # data is the list where the urls are stored
    try:
        # do your code here (requests, beautifulsoup)
        # r = requests.get(url)
        ...
    except:
        pass  # will go on to the next loop iteration (next url) if an error happens
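Put together, a hedged sketch of the whole loop (assuming the URLs sit in the first column of emails.csv, one per row) could look like this:
import csv

import requests
from bs4 import BeautifulSoup

# Read the URLs first (assumed to be in the first column of emails.csv).
urls = []
with open('emails.csv', newline='') as csv_file:
    for row in csv.reader(csv_file):
        if row:                      # skip blank lines
            urls.append(row[0])

# Then fetch each one, skipping any URL that fails for whatever reason.
for url in urls:
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        print(url, soup.title.text if soup.title else '(no title)')
    except Exception as exc:
        print('Skipping', url, '-', exc)   # carry on with the next URL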
Writing data scraped from an HTML table to a CSV file
I'm trying to figure out what the next step should be to convert my web scrape to CSV. I've tried putting every column into individual lists, but I feel like that is not the solution.
from bs4 import BeautifulSoup
import requests

url = 'https://www.pro-football-reference.com/years/2018/passing.htm'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

for row in tb.find_all('tr'):
    i = row.get_text()
    print(i)
This should work:
import csv  # quite crucial

final_table = []
for row in tb.find_all('tr'):
    next_line = row.get_text()
    final_table.append([next_line])

with open('output.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(final_table)
Use the csv module. We'll grab the headers with soup.find("tr").find_all("th"), then loop over the body and write it to the text file. The first cell of each row is a <th>, so we need to handle that separately and prepend it to the <td> data. Note that the staggered headers repeated every 30 lines are omitted.
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.pro-football-reference.com/years/2018/passing.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow([x.get_text() for x in soup.find("tr").find_all("th")])

    for row in soup.find_all("tr"):
        data = [row.find("th").get_text()] + [x.get_text() for x in row.find_all("td")]
        if data:
            writer.writerow(data)

Output (just the top few rows):
Rk,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds,NY/A,ANY/A,Sk%,4QC,GWD
1,Ben Roethlisberger,PIT,36,QB,16,16,9-6-1,452,675,67.0,5129,34,5.0,16,2.4,97,7.6,7.5,11.3,320.6,96.5,71.0,24,166,7.10,7.04,3.4,2,3
2,Andrew Luck*,IND,29,QB,16,16,10-6-0,430,639,67.3,4593,39,6.1,15,2.3,68,7.2,7.4,10.7,287.1,98.7,69.4,18,134,6.79,6.95,2.7,3,3
3,Matt Ryan,ATL,33,QB,16,16,7-9-0,422,608,69.4,4924,35,5.8,7,1.2,75,8.1,8.7,11.7,307.8,108.1,68.5,42,296,7.12,7.71,6.5,1,1
4,Kirk Cousins,MIN,30,QB,16,16,8-7-1,425,606,70.1,4298,30,5.0,10,1.7,75,7.1,7.3,10.1,268.6,99.7,58.2,40,262,6.25,6.48,6.2,1,0

Check this thread if you see extra newlines in the CSV result on Windows.
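If you do hit that issue, the documented fix is to open the file with newline='' so the csv module controls the line endings itself, e.g.:
import csv

# newline='' prevents the extra blank line that otherwise appears between
# rows when writing CSV files on Windows.
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Rk", "Player", "Tm"])  # placeholder row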
Creating a text-delimited file from HTML tables using BeautifulSoup
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles. Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    # Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if (td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    # Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

# Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)

createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually, Python provides a built-in csv module for that. Then, since you are up to "actions" only, I'd identify the "Actions" table and find the events-only rows. This can be done with the help of a filtering function checking the first cell to not be empty:
import csv

from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")

data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
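As an aside on the filtering-function idea: find_all accepts any callable and keeps the tags for which it returns a truthy value. A small self-contained demonstration with toy markup (not the real page):
from bs4 import BeautifulSoup

# Toy markup: one row whose first tdOdd cell has text, one whose cell is empty.
html = """<table>
<tr><td class="tdOdd">20:00</td><td class="tdOdd">GK Out</td></tr>
<tr><td class="tdOdd"></td><td class="tdOdd">ignored</td></tr>
</table>"""
soup = BeautifulSoup(html, "html.parser")

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

print(len(soup.find_all(only_action_rows)))  # 1 - only the row with a non-empty first cell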
How to write a new column to csv when webscraping?
I'd like to get some quick help with writing this webscraping program. So far it's scraping things correctly, but I'm having trouble writing the results to a CSV file. I'm scraping two things from each reviewer: the review score AND the written review. I'd like to write the review score into the first column and the written review into the second column. However, writerow only does it row by row. I appreciate any help on this! :)
import os, requests, csv
from bs4 import BeautifulSoup

# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

with open('GardensbytheBay.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Looping until the 5th page of reviews
    for pagecounter in range(3):

        # Request get the first page
        res = requests.get(URL)
        res.raise_for_status

        # Download the html of the first page
        soup = BeautifulSoup(res.text, "html.parser")

        # Match it to the specific tag for all 5 ratings
        reviewElems = soup.findAll('img', {'class': ['sprite-rating_s_fill rating_s_fill s50',
                                                     'sprite-rating_s_fill rating_s_fill s40',
                                                     'sprite-rating_s_fill rating_s_fill s30',
                                                     'sprite-rating_s_fill rating_s_fill s20',
                                                     'sprite-rating_s_fill rating_s_fill s10']})
        reviewWritten = soup.findAll('p', {'class': 'partial_entry'})

        if reviewElems:
            for row, rows in zip(reviewElems, reviewWritten):
                review_text = row.attrs['alt'][0]
                review2_text = rows.get_text(strip=True).encode('utf8', 'ignore').decode('latin-1')
                writer.writerow([review_text])
                writer.writerow([review2_text])
            print('Writing page', pagecounter + 1)
        else:
            print('Could not find clue.')

        # Find URL of next page and update URL
        if pagecounter == 0:
            nextLink = soup.select('a[data-offset]')[0]
        elif pagecounter != 0:
            nextLink = soup.select('a[data-offset]')[1]
        URL = 'http://www.tripadvisor.com' + nextLink.get('href')

print('Download complete')
You can put the review score and text in the same row but in different columns with:
writer.writerow([review_text, review2_text])
Your initial approach takes each of the items as a separate row and writes them in succession, which is not what you want.
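For illustration with made-up values, the difference between the two approaches is:
import csv
import sys

writer = csv.writer(sys.stdout)

# Two writerow calls -> two rows with one column each:
writer.writerow(['5 of 5 bubbles'])
writer.writerow(['Lovely gardens'])

# One writerow call with a two-item list -> one row with two columns:
writer.writerow(['5 of 5 bubbles', 'Lovely gardens'])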
You can use a pandas DataFrame:
import pandas as pd
import numpy as np

csv_file = pd.read_csv('GardensbytheBay.csv')
csv_file.insert(idx, colname, value)
csv_file.to_csv('output.csv', index=False)
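Since idx, colname and value above are only placeholders, here is a slightly more concrete sketch; the file layout and the inserted values are assumptions, not taken from the question:
import pandas as pd

# Assumes GardensbytheBay.csv currently holds one review score per row, no header.
df = pd.read_csv('GardensbytheBay.csv', header=None, names=['score'])

# Insert a 'review' column at position 1; the value list must match the row count.
df.insert(1, 'review', ['placeholder review'] * len(df))

df.to_csv('output.csv', index=False)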