CSV file being exported is empty and only the headers are showing - python

I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported CSV, it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")

box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")

with open('imdbmovies.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write headers to CSV file
    writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])

    for row in box_office_table:
        try:
            year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
            money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
            releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
            gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
            numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")

            if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
                total_gross_cell = money_cells[0]
                average_cell = money_cells[1]

                year = year_cell.text.strip()
                total_gross = total_gross_cell.text.strip()
                releases = releases_cell.text.strip()
                average = average_cell.text.strip()
                gross_change = gross_change_cell.text.strip()
                numone_release = numone_release_cell.text.strip()

                print(year, total_gross, releases, average, gross_change, numone_release)

                # Write the row to the CSV file
                writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
        except AttributeError:
            # Either a cell is not found
            pass
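Since nothing inside the loop is being written, it may help to first confirm that the request actually returned the page you expect and to count how many rows survive the filtering, rather than silently passing on exceptions. The sketch below is only a debugging aid, not a confirmed diagnosis; the browser-like User-Agent header and the idea that the page or class names may differ from what the code expects are assumptions.

import requests
from bs4 import BeautifulSoup

url = "https://www.boxofficemojo.com/year/"
# Some sites serve an error or placeholder page to the default requests User-Agent;
# sending a browser-like one is a guess at a possible cause, not a confirmed fix.
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(response.status_code)   # anything other than 200 means the page didn't load normally

soup = BeautifulSoup(response.content, "html.parser")
container = soup.find("div", class_="a-section mojo-body aok-relative")
print(container is not None)  # False means the container class no longer matches the page

rows = container.find_all("tr") if container else []
print(len(rows))              # 0 or 1 means only the header row (or nothing) was found

It can also help to log something in the except AttributeError branch instead of pass, so that silently skipped rows become visible.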

Related

Why isn't this web-scraping code returning any results?

I am trying to get the headlines for each and every day from Economic Times India from 2020-01-01 to 2020-12-31. This is what I have tried:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from dateutil import rrule
from calendar import monthrange
import csv

def read_url(year, month, starttime):
    url = f'https://economictimes.indiatimes.com/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def get_starttime(year, month, day):
    date1 = '1990-12-30'
    timestamp1 = time.mktime(datetime.datetime.strptime(date1, '%Y-%m-%d').timetuple())
    date2 = str(year) + '-' + str(month) + '-' + str(day)
    timestamp2 = time.mktime(datetime.datetime.strptime(date2, '%Y-%m-%d').timetuple())
    starttime = ((timestamp2 - timestamp1) / 86400)
    return str(starttime).replace(".0", "")

headlines_from = '2020-01-01'
headlines_to = '2020-10-31'
headlines_datetime_from = datetime.datetime.strptime(headlines_from, '%Y-%m-%d')
headlines_datetime_to = datetime.datetime.strptime(headlines_to, '%Y-%m-%d')

for dt in rrule.rrule(rrule.MONTHLY, dtstart=headlines_datetime_from, until=headlines_datetime_to):
    year = int(dt.strftime('%Y'))
    month = int(dt.strftime('%m'))
    for day in range(1, (monthrange(year, month)[1] + 1)):
        starttime = get_starttime(year, month, day)
        data_str_eng = str(year) + '-' + '{:02d}'.format(month) + '-' + '{:02d}'.format(day)
        headlines = []
        soup = read_url(year, month, starttime)
        for td in soup.findAll('td', {'class': 'contentbox5'}):
            for headline in td.findAll('a'):
                if 'archive' not in headline.get('href'):
                    if len(headline.contents) > 0:
                        if headline.contents[0] not in headlines:
                            headlines.append(headlines.contents[0])
        time.sleep(1)
        file = open(f'C:/Users/somar/OneDrive - Technological University of the Shannon Midwest/mythesis/mynew thesis topic/economic_news_headlines_{data_str_eng}.csv', 'w')
        with file:
            write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
            for item in headlines:
                write.writerow([item, ])
The code runs without errors, but I am getting 0 KB data files.
This works for me. The issue is probably with the code that creates the dates for your URL. Perhaps those URLs are sending you to an "empty" page that doesn't contain the HTML you're searching for. Respectfully, it seems like you have spent little time troubleshooting your code, so please do so. If you run into other problems, come back and I'll help you out.
Note: I changed read_url to take URLs, just for the sake of troubleshooting. I also passed two new arguments to the open function, to fix an encoding error and to avoid a newline every other row (in the CSV), respectively.
def read_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

if __name__ == '__main__':
    url = "https://economictimes.indiatimes.com/archivelist/year-2022,month-12,starttime-44911.cms"

    # get the site's page source, pass it to a bs obj, return the bs obj
    soup = read_url(url)

    headlines = []
    for td in soup.findAll('td', {'class': 'contentbox5'}):
        for headline in td.findAll('a'):
            if 'archive' not in headline.get('href'):
                if len(headline.contents) > 0:
                    if headline.contents[0] not in headlines:
                        headlines.append(headline.contents[0])
    # print(headlines)

    # write to file using utf-8 encoding and without adding a newline every other row
    file = open(f'PATH/economic_news_headlines.csv', 'w', encoding="utf-8", newline='')
    with file:
        write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
        for item in headlines:
            write.writerow([item, ])
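If the date arithmetic is indeed the problem, one thing to check is the epoch used in get_starttime. The working example URL above (year-2022, month-12, starttime-44911) is consistent with starttime being a day count from 1899-12-30, the Excel-style date epoch, since 44911 days after 1899-12-30 falls in December 2022; the question's get_starttime counts from 1990-12-30 instead, which produces much smaller numbers. A minimal sketch of the calculation, under that assumed epoch:

from datetime import date

def get_starttime(year, month, day):
    # Assumption: the archivelist starttime is the number of days since 1899-12-30
    # (an Excel-style serial date), which matches the example URL above
    # (44911 -> 2022-12-16). Not verified beyond that example.
    epoch = date(1899, 12, 30)
    return str((date(year, month, day) - epoch).days)

print(get_starttime(2022, 12, 16))  # 44911, matching the example URL
print(get_starttime(2020, 1, 1))    # 43831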

Handling HTML in Python

I had a problem when I extracted HTML files and imported them into Excel.
This is the site I need to get information from: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html
As you can see, in the GDP table there is a header row named 年份 that is split across two lines.
That's why, after I exported the Excel file, it gave unexpected results.
The result I want is for the first line in Excel to have only: 年份, GDP(美元), 占世界%
Sorry for my confusing explanation, I really don't know how to explain it in detail.
Here is my Python code:
import requests
from bs4 import BeautifulSoup
import lxml
import csv

def get_html(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        r = "fail"
        return r

def getGDP(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    trs = soup.find_all('tr')
    for tr in trs:
        list = []
        for th in tr:
            ts = th.string
            if ts == '\n':
                continue
            list.append(ts)
        ulist.append(list)

def saveGDP(ulist):
    file_name = '21095010 胡碧玉 GDP.csv'
    with open(file_name, 'w', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)

def main():
    unifo = []
    url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
    html = get_html(url)
    getGDP(unifo, html)
    saveGDP(unifo)

if __name__ == "__main__":
    main()
Thank you so much!
Scraping tables and cleaning the results with pandas is in most cases much easier - under the hood, BeautifulSoup is working for you.
In this case, read_html() the table, drop the unwanted header level, and filter out the rows containing ads:
import pandas as pd
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)
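For context on why droplevel(0, axis=1) is there: read_html() parses the two-row header into a MultiIndex on the columns, and dropping the top level leaves the single header row you want. A small sketch; the exact column labels depend on the live page:

import pandas as pd

url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
df = pd.read_html(url)[0]
print(df.columns)                        # two header levels, produced by the split 年份 row
print(df.droplevel(0, axis=1).columns)   # single level: 年份, GDP(美元), 占世界%, ...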
Answering your question
You have to select your elements more specifically, e.g. with CSS selectors.
So first get the thead information from all th without a colspan, then collect the data from all tr in tbody that do not contain ads:
def getGDP(html):
    soup = BeautifulSoup(html, "html.parser")
    data = []
    data.append([th.text for th in soup.select('thead th:not([colspan])')])
    for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(row.stripped_strings))
    return data
Example
import requests
from bs4 import BeautifulSoup
import lxml
import csv

def get_html(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        r = "fail"
        return r

def getGDP(html):
    soup = BeautifulSoup(html, "html.parser")
    data = []
    data.append([th.text for th in soup.select('thead th:not([colspan])')])
    for x in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(x.stripped_strings))
    return data

def saveGDP(ulist):
    file_name = '21095010 胡碧玉 GDP.csv'
    print(ulist)
    with open(file_name, 'w', errors='ignore', encoding='utf-8') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)

def main():
    url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
    html = get_html(url)
    saveGDP(getGDP(html))

if __name__ == "__main__":
    main()

Write.CSV to a folder showing file not defined error

I have a script which produces multiple .csv files, and each .csv file has its own name, which is a variable. I am trying to save these files to a specific path instead of saving them to the Python folder.
I have tried this tutorial, Specify path in write.csv function, but it gave me this error: NameError: name 'file' is not defined. I tried to find other people who had the same issue when using write.csv, but was unable to find any.
I am on macOS.
Here is the code:
path = '/Users/chris/Desktop/cd'
fcsv = csv.writer(open, file.path(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
I have tried multiple examples of writing a CSV to a file path and have had zero success. If anyone has any ideas or suggestions, I'd love to hear them.
Here is my full code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime

headers = []
datarows = []

# define 1-1-2020 as a datetime object
after_date = datetime(2020, 1, 1)

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-20.html')
    soup = bs(r.content, 'lxml')

    # select all tr elements (minus the first one, which is the header)
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        children = element.contents  # get children of table element
        url = children[1].a['href']
        last_out_str = children[8].text
        # check to make sure the date field isn't empty
        if last_out_str != "":
            # load date into datetime object for comparison (second part is defining the layout of the date as years-months-days hour:minute:second timezone)
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
            # if check to see if the date is after 2020/1/1
            if last_out > after_date:
                address_links.append(url)

    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")

        # Get the Doge Address for the filename
        item = soup.find('h1').text
        newitem = item.replace('Dogecoin', '')
        finalitem = newitem.replace('Address', '')
        finalitem = finalitem.replace(' ', '')

        # Get the profit
        sections = soup.find_all(class_='table-striped')
        for section in sections:
            oldprofit = section.find_all('td')[11].text
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)

        # Compare profit to goal
        goal = float(50000)
        if profit < goal:
            continue

        if table:
            for row in table.find_all('tr'):
                heads = row.find_all('th')
                if heads:
                    headers = [th.text for th in heads]
                else:
                    datarows.append([td.text for td in row.find_all('td')])

        path = '/Users/chris/Desktop/cd'
        fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
        fcsv.writerow(headers)
        fcsv.writerows(datarows)
You can have automatic file closing using a with statement:
with open(f'{finalitem}.csv', 'w', newline='') as csvfile:
    fcsv = csv.DictWriter(csvfile, fieldnames=headers)  # assuming headers is a list object
    fcsv.writeheader()
    # datarows holds plain lists, so map each row onto the header names for DictWriter
    fcsv.writerows(dict(zip(headers, row)) for row in datarows)
The error you are getting comes from wrapping file.path() around arguments that should be passed to the open function; you probably meant to use your path variable rather than a file.path() function (which doesn't exist in Python) when building the file name/download path.
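If the goal is to write each CSV into the folder held in path, a minimal sketch (assuming path, finalitem, headers and datarows are defined as in the question) is to build the full file name with os.path.join:

import os
import csv

path = '/Users/chris/Desktop/cd'

# Join the target folder and the per-address file name, then write as usual.
with open(os.path.join(path, f'{finalitem}.csv'), 'w', newline='') as csvfile:
    fcsv = csv.writer(csvfile)
    fcsv.writerow(headers)
    fcsv.writerows(datarows)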

BeautifulSoup Scraping Formatting

This is my first time using BeautifulSoup, and I am attempting to scrape store location data from a local convenience store chain.
However, I'm running into some issues trying to remove empty lines when the data is written to a CSV file; I've tried .replace('\n', '') and .strip(), and neither worked.
I'm also having problems splitting data that is scraped from the same sibling element.
I've added the script below:
from bs4 import BeautifulSoup
from requests import get
import urllib.request
import sched, time
import csv

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print(soup.prettify())

# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(location_data)

cheers = soup.find('div', id="store_container")
count = 0

# Loop for Header tags
for paragraph in cheers.find_all('b'):
    header1 = paragraph.text.replace(':', '')
    header2 = paragraph.find_next('b').text.replace(':', '')
    header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
    if count == 0:
        csvwriter.writerow([header1, header2, header3])
        count += 1
    break

for paragraph in cheers.find_all('br'):
    brnext = paragraph.next_sibling.strip()
    brnext1 = paragraph.next_sibling
    test1 = brnext1.next_sibling.next_sibling
    print(test1)
    csvwriter.writerow([brnext, test1])

location_data.close()
Sample of output generated:
Sample of what output should look like:
How can I achieve this?
Thanks in advance.
To make it slightly more organized, you can try something like the following. I've used .select() instead of .find_all().
import csv
from bs4 import BeautifulSoup
import requests

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

with open("output.csv", "w", newline="") as infile:
    writer = csv.writer(infile)
    writer.writerow(["Address", "Telephone", "Store hours"])
    for items in soup.select("#store_container .store_col"):
        addr = items.select_one("b").next_sibling.next_sibling
        tel = items.select_one("b:nth-of-type(2)").next_sibling
        store = items.select_one("b:nth-of-type(3)").next_sibling
        writer.writerow([addr, tel, store])
You just need to change the way the address, telephone and store hours are extracted:
import csv
from bs4 import BeautifulSoup
from requests import get

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print(soup.prettify())

# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(location_data)

cheers = soup.find('div', id="store_container")
count = 0

# Loop for Header tags
for paragraph in cheers.find_all('b'):
    header1 = paragraph.text.replace(':', '')
    header2 = paragraph.find_next('b').text.replace(':', '')
    header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
    if count == 0:
        csvwriter.writerow([header1, header2, header3])
        count += 1
    break

for paragraph in cheers.find_all('div'):
    label = paragraph.find_all('b')
    if len(label) == 3:
        print(label)
        address = label[0].next_sibling.next_sibling
        tel = label[1].next_sibling
        hours = label[2].next_sibling
        csvwriter.writerow([address, tel, hours])

location_data.close()

Pandas merge dataframes on same column

I am writing a scraper, and I want to loop through a list of links and merge all the results as columns into a dataframe on the same key (like a left join).
I run this code in the IPython Notebook, and the resulting CSV that comes from the dataframe does not make sense. However, if after running the script I merge df and df2 on the shared column "questions", I get the join that I need, so something in the script is wrong.
Here's the whole script. There's a log-in with requests, but you don't have to make a user; you can run it as a guest, you just won't get all the answers in the review.
import requests
from bs4 import BeautifulSoup as bs
import csv
import pandas as pd

get_url = 'https://www.g2crowd.com/login?form=login'
post_url = 'https://www.g2crowd.com/user_sessions'
review_url = 'https://www.g2crowd.com/survey_responses/salesforce-crm-review-29972'

links = []
with open("links.csv", "r") as f:
    spamreader = csv.reader(f, delimiter=',')
    for row in spamreader:
        links.append(row)
links = links[1:]

s = requests.Session()
r = s.get(get_url)
soup = bs(r.text)
token = soup.select('input[name="authenticity_token"]')[0]['value']

username = 'email#gmail.com'
password = 'password'
payload = {"user_session[login_email]": "email#gmail.com", "user_session[password]": "password"}
payload['authenticity_token'] = token
Referer = dict(Referer=get_url)
r = s.post(post_url, data=payload, headers=Referer)
print r.status_code

df = pd.read_csv("data.csv")
#df = df.set_index('questions')

for link in links:
    r = s.get(link[0])
    soup = bs(r.text)
    title = soup.title.contents[0]
    question_wrapper = soup.findAll("div", class_="question-wrapper")
    print len(question_wrapper)

    questions = []
    answers = []
    scraped_titles = []
    tricky_title = 'Salesforce CRM Review by G2 Crowd User in Transportation/Trucking/Railroad - March 13, 2013'

    with open("scraped_titles.csv", "r") as f:
        spamreader = csv.reader(f, delimiter=',')
        for row in spamreader:
            scraped_titles.append(row[0])
    scraped_titles = set(scraped_titles)

    if (title not in scraped_titles and title != tricky_title):
        for question in question_wrapper:
            q = question.label.contents[0]
            a = question.div.contents[0].text
            questions.append(q)
            answers.append(a)
        #qa = zip(questions, answers)
        qa = dict(questions=questions, answers=answers)
        df2 = pd.DataFrame(qa)
        #df2 = df2.set_index('questions', inplace=True)
        #df2.to_csv(title + ".csv", encoding='utf-8')
        df = pd.merge(df, df2, how='left', on='questions')
        with open("scraped_titles.csv", "a") as csvwriter:
            spamreader = csv.writer(csvwriter, delimiter=',')
            spamreader.writerow([unicode(title).encode("utf-8")])
    else:
        pass

df.to_csv("all_data.csv", encoding='utf-8')
I also tried to save every review to a .csv and then merge everything with pandas, but I get a weird, rarely documented error:
Error: new-line character seen in unquoted field - do you need to open
the file in universal-newline mode?
I have been trying to find my error for quite a while; if somebody can point it out, it would be extremely helpful.
Also, I hope I have formatted the post according to the rules; if not, please help me to correct it.
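A hedged note on that last error: in Python 2, the csv module raises "new-line character seen in unquoted field" when a file containing bare \r line endings is opened in plain 'r' mode, and the fix is what the message itself suggests: open the file in universal-newline mode. A minimal sketch, matching the Python 2 style of the script above; the file name is just a placeholder:

import csv

# 'U' turns on universal-newline mode in Python 2's open(), so \r-only line
# endings are translated and the csv reader no longer trips over them.
# 'review.csv' stands in for whichever per-review file fails to parse.
with open('review.csv', 'rU') as f:
    for row in csv.reader(f):
        print row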
