I have a script that produces multiple .csv files, and each file's name comes from a variable. I am trying to save these files to a specific path instead of the folder Python runs from.
I tried the tutorial "Specify path in write.csv function", but it gave me this error: NameError: name 'file' is not defined. I also tried to find other people who had the same issue when using write.csv, but was unable to find any.
I am on macOS.
Here is the code:
path = '/Users/chris/Desktop/cd'
fcsv = csv.writer(open, file.path(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
I have tried multiple examples of writing a csv to a file path and have had no success. If anyone has any ideas or suggestions, I'd love to hear them.
Here is my full code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime

headers = []
datarows = []

# define 1-1-2020 as a datetime object
after_date = datetime(2020, 1, 1)

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-20.html')
    soup = bs(r.content, 'lxml')

    # select all tr elements (minus the first one, which is the header)
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        children = element.contents  # get children of table element
        url = children[1].a['href']
        last_out_str = children[8].text
        # check to make sure the date field isn't empty
        if last_out_str != "":
            # load date into datetime object for comparison (second part is defining the layout of the date as years-months-days hour:minute:second timezone)
            last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
            # if check to see if the date is after 2020/1/1
            if last_out > after_date:
                address_links.append(url)

    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")

        # Get the Doge Address for the filename
        item = soup.find('h1').text
        newitem = item.replace('Dogecoin', '')
        finalitem = newitem.replace('Address', '')
        finalitem = finalitem.replace(' ', '')

        # Get the profit
        sections = soup.find_all(class_='table-striped')
        for section in sections:
            oldprofit = section.find_all('td')[11].text
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)

        # Compare profit to goal
        goal = float(50000)
        if profit < goal:
            continue

        if table:
            for row in table.find_all('tr'):
                heads = row.find_all('th')
                if heads:
                    headers = [th.text for th in heads]
                else:
                    datarows.append([td.text for td in row.find_all('td')])

        path = '/Users/chris/Desktop/cd'
        fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
        fcsv.writerow(headers)
        fcsv.writerows(datarows)
You can have automatic file closing using a with statement:
with open(f'{finalitem}.csv', 'w', newline='') as csvfile:
    fcsv = csv.DictWriter(csvfile, fieldnames=headers)  # assuming headers is a list object
    fcsv.writeheader()
    fcsv.writerows(datarows)
The error you are getting comes from wrapping file.path() around arguments that should be passed to open(); file.path() is an R function, not Python, which is why Python raises NameError: name 'file' is not defined. You most likely want to use your path variable when building the output location, rather than a path() function.
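For example, here is a minimal sketch of how the output location could be built from the path variable, assuming path, finalitem, headers and datarows are set as in the code above and that the target folder already exists:

import csv
import os

# join the directory and the per-address filename, then write as before
outfile = os.path.join(path, f'{finalitem}.csv')  # e.g. /Users/chris/Desktop/cd/<address>.csv
with open(outfile, 'w', newline='') as csvfile:
    fcsv = csv.writer(csvfile)
    fcsv.writerow(headers)
    fcsv.writerows(datarows)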
I am trying to get the headlines of each and every day from The Economic Times India, from 2020-01-01 to 2020-12-31. This is what I have tried:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from dateutil import rrule
from calendar import monthrange
import csv

def read_url(year, month, starttime):
    url = f'https://economictimes.indiatimes.com/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def get_starttime(year, month, day):
    date1 = '1990-12-30'
    timestamp1 = time.mktime(datetime.datetime.strptime(date1, '%Y-%m-%d').timetuple())
    date2 = str(year) + '-' + str(month) + '-' + str(day)
    timestamp2 = time.mktime(datetime.datetime.strptime(date2, '%Y-%m-%d').timetuple())
    starttime = ((timestamp2 - timestamp1) / 86400)
    return str(starttime).replace(".0", "")

headlines_from = '2020-01-01'
headlines_to = '2020-10-31'
headlines_datetime_from = datetime.datetime.strptime(headlines_from, '%Y-%m-%d')
headlines_datetime_to = datetime.datetime.strptime(headlines_to, '%Y-%m-%d')

for dt in rrule.rrule(rrule.MONTHLY, dtstart=headlines_datetime_from, until=headlines_datetime_to):
    year = int(dt.strftime('%Y'))
    month = int(dt.strftime('%m'))
    for day in range(1, (monthrange(year, month)[1] + 1)):
        starttime = get_starttime(year, month, day)
        data_str_eng = str(year) + '-' + '{:02d}'.format(month) + '-' + '{:02d}'.format(day)
        headlines = []
        soup = read_url(year, month, starttime)
        for td in soup.findAll('td', {'class': 'contentbox5'}):
            for headline in td.findAll('a'):
                if 'archive' not in headline.get('href'):
                    if len(headline.contents) > 0:
                        if headline.contents[0] not in headlines:
                            headlines.append(headlines.contents[0])
        time.sleep(1)
        file = open(f'C:/Users/somar/OneDrive - Technological University of the Shannon Midwest/mythesis/mynew thesis topic/economic_news_headlines_{data_str_eng}.csv', 'w')
        with file:
            write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
            for item in headlines:
                write.writerow([item, ])
The code runs without errors, but I am getting 0 KB data files.
This works for me. The issue is probably with the code that creates the dates for your URL. Perhaps those URLs are sending you to an "empty" page that doesn't contain the HTML you're searching for. Respectfully, it seems like you have spent little time troubleshooting your code, so please do so. If you run into other problems, come back and I'll help you out.
Note: I changed read_url to take URLs, just for the sake of troubleshooting. I also passed two new arguments to the open function: one to fix an encoding error and one to avoid a blank line between every other row in the CSV.
def read_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from the website. Response status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

if __name__ == '__main__':
    url = "https://economictimes.indiatimes.com/archivelist/year-2022,month-12,starttime-44911.cms"

    # get sites page source, pass it to bs obj, return bs obj
    soup = read_url(url)

    headlines = []
    for td in soup.findAll('td', {'class': 'contentbox5'}):
        for headline in td.findAll('a'):
            if 'archive' not in headline.get('href'):
                if len(headline.contents) > 0:
                    if headline.contents[0] not in headlines:
                        headlines.append(headline.contents[0])
    # print(headlines)

    # write to file using utf-8 encoding and without adding a newline every other row
    file = open(f'PATH/economic_news_headlines.csv', 'w', encoding="utf-8", newline='')
    with file:
        write = csv.writer(file, escapechar='\\', quoting=csv.QUOTE_NONE)
        for item in headlines:
            write.writerow([item, ])
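One quick way to troubleshoot the URL generation, offered as a hedged sketch: the '1990-12-30' epoch and the division by 86400 below are copied from the question's get_starttime, and 44911 is the starttime in the working December 2022 URL above. If the values this prints are nowhere near 44911, the generated archive URLs are the problem.

import datetime
import time

def starttime_for(year, month, day):
    # the question's day arithmetic, returned here as a plain number
    epoch = time.mktime(datetime.datetime(1990, 12, 30).timetuple())
    target = time.mktime(datetime.datetime(year, month, day).timetuple())
    return (target - epoch) / 86400

print(starttime_for(2022, 12, 1), starttime_for(2022, 12, 31))  # what the question's code generates
print(44911)  # starttime from the URL the answer above verified works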
So I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported csv, it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")

box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")

with open('imdbmovies.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write headers to CSV file
    writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])

    for row in box_office_table:
        try:
            year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
            money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
            releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
            gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
            numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")

            if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
                total_gross_cell = money_cells[0]
                average_cell = money_cells[1]

                year = year_cell.text.strip()
                total_gross = total_gross_cell.text.strip()
                releases = releases_cell.text.strip()
                average = average_cell.text.strip()
                gross_change = gross_change_cell.text.strip()
                numone_release = numone_release_cell.text.strip()

                print(year, total_gross, releases, average, gross_change, numone_release)

                # Write the row to the CSV file
                writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
        except AttributeError:
            # Either a cell is not found
            pass
This is my first time using BeautifulSoup, and I am attempting to scrape store location data from a local convenience store chain.
However, I'm running into some issues trying to remove empty lines when the data is written to a CSV file; I've tried .replace('\n', '') and .strip(), and neither worked.
I'm also having problems splitting data that is scraped from the same sibling element.
I've added the script below:
from bs4 import BeautifulSoup
from requests import get
import urllib.request
import sched, time
import csv

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print (soup.prettify())

# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(location_data)

cheers = soup.find('div', id="store_container")

count = 0

# Loop for Header tags
for paragraph in cheers.find_all('b'):
    header1 = paragraph.text.replace(':', '')
    header2 = paragraph.find_next('b').text.replace(':', '')
    header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
    if count == 0:
        csvwriter.writerow([header1, header2, header3])
        count += 1
    break

for paragraph in cheers.find_all('br'):
    brnext = paragraph.next_sibling.strip()
    brnext1 = paragraph.next_sibling
    test1 = brnext1.next_sibling.next_sibling
    print(test1)
    csvwriter.writerow([brnext, test1])

location_data.close()
Sample of output generated:
Sample of what output should look like:
How can I achieve this?
Thanks in advance.
To make it slightly more organized, you can try something like the following. I've used .select() instead of .find_all().
import csv
from bs4 import BeautifulSoup
import requests

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

with open("output.csv", "w", newline="") as infile:
    writer = csv.writer(infile)
    writer.writerow(["Address", "Telephone", "Store hours"])
    for items in soup.select("#store_container .store_col"):
        addr = items.select_one("b").next_sibling.next_sibling
        tel = items.select_one("b:nth-of-type(2)").next_sibling
        store = items.select_one("b:nth-of-type(3)").next_sibling
        writer.writerow([addr, tel, store])
You just need to change the way you extract the address, telephone and store hours:
import csv
from bs4 import BeautifulSoup
from requests import get

url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print (soup.prettify())

# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(location_data)

cheers = soup.find('div', id="store_container")

count = 0

# Loop for Header tags
for paragraph in cheers.find_all('b'):
    header1 = paragraph.text.replace(':', '')
    header2 = paragraph.find_next('b').text.replace(':', '')
    header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
    if count == 0:
        csvwriter.writerow([header1, header2, header3])
        count += 1
    break

for paragraph in cheers.find_all('div'):
    label = paragraph.find_all('b')
    if len(label) == 3:
        print(label)
        address = label[0].next_sibling.next_sibling
        tel = label[1].next_sibling
        hours = label[2].next_sibling
        csvwriter.writerow([address, tel, hours])

location_data.close()
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    # Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if (td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    # Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

# Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)

createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you only want the "Actions", I'd locate the "Actions" table and pick out the event rows only. This can be done with the help of a filtering function that checks that the first cell is not empty:
import csv
from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")
data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
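If you want the pipe-delimited layout shown in the question rather than a comma-separated file, the same csv module can produce it; a minimal sketch, reusing the event_id and data built above:

# write the same rows pipe-delimited, one file per game id
with open("{event_id}.txt".format(event_id=event_id), "w", newline="") as f:
    writer = csv.writer(f, delimiter="|")
    writer.writerows(data)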
I am writing a scraper, and I want to loop through a list of links and merge all results as columns into a dataframe on the same key (like a left join).
I run this code in an IPython Notebook. The CSV produced from the dataframe does not make sense; however, if after running the script I merge df and df2 on the common "questions" column, I get the join that I need, so something in the script itself is wrong.
Here's the whole script. There is a log-in step with requests, but you don't have to create a user; you can run it as a guest, you just won't get all the answers in each review.
import requests
from bs4 import BeautifulSoup as bs
import csv
import pandas as pd

get_url = 'https://www.g2crowd.com/login?form=login'
post_url = 'https://www.g2crowd.com/user_sessions'
review_url = 'https://www.g2crowd.com/survey_responses/salesforce-crm-review-29972'

links = []
with open("links.csv", "r") as f:
    spamreader = csv.reader(f, delimiter=',')
    for row in spamreader:
        links.append(row)
links = links[1:]

s = requests.Session()
r = s.get(get_url)
soup = bs(r.text)
token = soup.select('input[name="authenticity_token"]')[0]['value']

username = 'email#gmail.com'
password = 'password'
payload = {"user_session[login_email]": "email#gmail.com", "user_session[password]": "password"}
payload['authenticity_token'] = token
Referer = dict(Referer=get_url)
r = s.post(post_url, data=payload, headers=Referer)
print r.status_code

df = pd.read_csv("data.csv")
#df = df.set_index('questions')

for link in links:
    r = s.get(link[0])
    soup = bs(r.text)
    title = soup.title.contents[0]
    question_wrapper = soup.findAll("div", class_="question-wrapper")
    print len(question_wrapper)

    questions = []
    answers = []
    scraped_titles = []
    tricky_title = 'Salesforce CRM Review by G2 Crowd User in Transportation/Trucking/Railroad - March 13, 2013'

    with open("scraped_titles.csv", "r") as f:
        spamreader = csv.reader(f, delimiter=',')
        for row in spamreader:
            scraped_titles.append(row[0])
    scraped_titles = set(scraped_titles)

    if (title not in scraped_titles and title != tricky_title):
        for question in question_wrapper:
            q = question.label.contents[0]
            a = question.div.contents[0].text
            questions.append(q)
            answers.append(a)
        #qa = zip(questions, answers)
        qa = dict(questions=questions, answers=answers)
        df2 = pd.DataFrame(qa)
        #df2 = df2.set_index('questions', inplace=True)
        #df2.to_csv(title + ".csv", encoding='utf-8')
        df = pd.merge(df, df2, how='left', on='questions')
        with open("scraped_titles.csv", "a") as csvwriter:
            spamreader = csv.writer(csvwriter, delimiter=',')
            spamreader.writerow([unicode(title).encode("utf-8")])
    else:
        pass

df.to_csv("all_data.csv", encoding='utf-8')
I also tried saving every review to a .csv and then merging everything with Pandas, but I get a weird, rarely documented error:
Error: new-line character seen in unquoted field - do you need to open
the file in universal-newline mode?
I have been trying to find my error for quite a while; if somebody can point it out, it would be extremely helpful.
Also, I hope I have formatted the post according to the rules; if not, please help me correct it.