Why am I getting only the last output in the CSV? - python

I have a bunch of URLs in a CSV file and I have to extract data from those URLs into another CSV file. I extracted the data from those URLs into a dataframe using my code below, but when it comes to saving the extracted data to the output CSV, it only shows me the last extracted record (i.e. if I have 10 URLs in demo.csv, only the data for the 10th URL appears in the output CSV, not the data for all of them).
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)

dict = {'url': [url], 'website': [website]}
df = pd.DataFrame(dict)
df.to_csv('export_dataframe.csv', index=False)

The problem seems to be the indentation of the line where you add your data to the dict. It sits outside the loop, so only the last URL's data is added. I have pointed it out with a comment in the code below.
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)
        data.append([url, website])  # this line is outside the loop in your code; I am using a list here just to simplify (you can still use a dict)

df = pd.DataFrame(data, columns=['url', 'website'])
df.to_csv('export_dataframe.csv', index=False)
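If you prefer the dict style from the original code, you can collect one dict per row and build the DataFrame once at the end. A minimal, self-contained sketch with placeholder URLs (not the asker's real data):

import pandas as pd

# Sketch: append one dict per URL inside the loop, build the DataFrame once afterwards.
rows = []
for url, website in [('http://example.com/a', 'a.com'), ('http://example.com/b', 'b.com')]:
    rows.append({'url': url, 'website': website})  # the append stays inside the loop

df = pd.DataFrame(rows)  # pandas builds the columns from the dict keys
df.to_csv('export_dataframe.csv', index=False)

Either way, the important part is that the append happens inside the loop and the DataFrame is created only once.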

Related

Save .csv file with names in first column and links in second column (IndentationError: expected an indented block)

When trying to run the script the following error appears:
line 16
for tag in jogos:
^
IndentationError: expected an indented block
My expected result is:
COLUMN 1           COLUMN 2
Team A v Team B    LINK HREF
Team C v Team D    LINK HREF
Team E v Team F    LINK HREF
Team G v Team H    LINK HREF
Another problem is that when I capture the data with the names namelist and linkslist, I only get the first value rather than all of the possible values. In this answer (https://stackoverflow.com/a/68446386/11462274), printing the values like this:
print(tag.find("a", href=True).get_text().strip())
Or this:
print(tag.find("a", href=True)["href"])
The result delivers the complete list of values, but when I assign specific names like namelist and linkslist, it stops returning the complete list and delivers only one value.
Full Script:
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
for tag in jogos:
    namelist = tag.find("a", href=True).get_text().strip()
    linkslist = tag.find("a", href=True)["href"]
    row = namelist + ';' + linkslist + '\n'
    f.write(row)
The error message is telling you exactly that. Since you're using a context manager (with ...), you have to write the code inside that indented block.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open("Lista_de_Jogos.csv", "a", newline="", encoding="UTF8") as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ";" + linkslist + "\n"
        f.write(row)
You have to indent the code after the 'with open' statement.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
print(jogos)

with open('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ';' + linkslist + '\n'
        f.write(row)

Unable to retrieve data for all pages using beautiful soup while writing to CSV

I'm scraping a particular website that carries information on different currencies. I'm unable to retrieve all of the data when I write to the CSV file. Please let me know how to go about it.
Code
import requests
from bs4 import BeautifulSoup
import pandas as pd

lista = ["eur-usd-historical-data", "usd-jpy-historical-data"]
listb = []
link = "https://www.investing.com/currencies/%d"
for k, v in enumerate(lista):
    urlConcat = link.replace("%d", lista[k])
    k += 1
    listb.append(urlConcat)

for url in listb:
    lista = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }
    response = requests.get(url, headers=headers).text
    data = BeautifulSoup(response, 'html.parser')
    #print(data)
    divs = data.findAll("table", {"class": "genTbl closedTbl historicalTbl"})
    for div in divs:
        row = ''
        row = div.findAll("td")
        for rows in row:
            if(rows.text.find("tr")):
                lista.append(rows.text)
    final_list = [lista[6*i:6*(i+1)] for i in range(int(round(len(lista)/6)) + 1)]
    final_df = pd.DataFrame(final_list, columns=['date','price','opne','high','low','% change'])
    final_df['currency'] = url
    final_df['currency'] = final_df['currency'].str.split('/').str[-1].str.split('-').str[:2].str.join('-')
    print(final_df)
    final_df.to_csv('result.csv')
Here the result in the CSV consists of data for only one currency, and if I print final_df in a different cell it also shows the result for only one currency. What should be done to fix this issue?
Try appending to the CSV instead, maybe like:
final_df.to_csv('result.csv', mode='a', header=False)
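Note that with mode='a' every run keeps adding rows to result.csv. An alternative, shown only as a sketch with placeholder data, is to collect each currency's frame in a list and write the file once with pd.concat:

import pandas as pd

# Sketch: accumulate one DataFrame per currency, write the CSV a single time at the end.
# The loop and the placeholder DataFrame below stand in for the existing scraping loop.
frames = []
for currency in ['eur-usd', 'usd-jpy']:
    final_df = pd.DataFrame({'date': ['2021-01-01'], 'price': [1.23]})  # placeholder for the scraped table
    final_df['currency'] = currency
    frames.append(final_df)

pd.concat(frames, ignore_index=True).to_csv('result.csv', index=False)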

How can I append these two beautiful soup objects into a dataframe simultaneously?

I'm trying to add the dates and dividends from Yahoo Finance into a dataframe at the same time, so that the dates line up with the dividends. I'm not really sure how to do this because I don't know how to use both for loops at the same time. Hoping you might be able to make an attempt. Thank you!
import pandas as pd
from datetime import date
import sys
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/MSFT/history?period1=1570838400&period2=1602460800&interval=div%7Csplit&filter=div&frequency=1d&includeAdjustedClose=true"
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
page = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(page.text, "html.parser")
dividend = soup.findAll("strong")
date = soup.find_all('td', class_='Py(10px) Ta(start) Pend(10px)')

dividends = []
for f in dividend:
    dividends.append(f.text)

dates = []
for f in date:
    dates.append(f.text)

print(dates, dividends)
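One way to line the two lists up, shown here only as a sketch with made-up values (not verified against the live Yahoo page), is to zip the dates and dividends together and build the DataFrame in one step:

import pandas as pd

# Sketch: pair each date with its dividend and build the DataFrame once.
dates = ['Nov 18, 2020', 'Aug 19, 2020']   # stands in for the scraped dates list
dividends = ['0.56', '0.51']               # stands in for the scraped dividends list

df = pd.DataFrame(list(zip(dates, dividends)), columns=['Date', 'Dividend'])
print(df)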

Data overwrites when export to Excel

I'm scraping a website to gather the ten most recent articles based on some keywords. Once I get my data (keyword used, article name, the URL/hyperlink, and the publication date), I want to write it all to an xls file. So far it only writes the results for the last keyword instead of all four; it just keeps overwriting the same section of the spreadsheet. How can I write out my entire list, not just the most recent section?
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")
keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')
for keyword in keywords:
    keyword.replace("+", " ")
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    mylist = []
    cols = "KeyWord", "Article", "URL", "Publication Date"
    mylist.append(cols)
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)
    book = xlwt.Workbook()
    sheet = book.add_sheet("Articles")
    for i, row in enumerate(mylist):
        for j, col in enumerate(row):
            sheet.write(i, j, col)
    book.save("C:\Python\American Banker\American Banker {}.xls".format(today))

for keyword in keywords:
    article_fetch(keyword)

print('Workbook Saved')
I would expect to see my entire list, with results for all four keywords. However, I am only seeing the results for the last keyword.
I've moved Excel file generation to the end of the script:
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")
keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')
for keyword in keywords:
    keyword.replace("+", " ")
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)

mylist = []
cols = "KeyWord", "Article", "URL", "Publication Date"
mylist.append(cols)

for keyword in keywords:
    article_fetch(keyword)

book = xlwt.Workbook()
sheet = book.add_sheet('Articles')
for i, row in enumerate(mylist):
    for j, col in enumerate(row):
        sheet.write(i, j, col)
book.save("American Banker {}.xls".format(today))

print('Workbook Saved')
Data is no longer lost.

Want to scrape data using 10 different keywords in URL for 2 pages and write scraped data to csv using Python 3.6.2 and BS4

I have the code ready for one keyword and it's working fine. The next problem is that I want to run the scrape for 10 different keywords and save the results in one CSV file, with the keyword name in its own column/row. I think we could give a CSV file as input so it picks the keywords one by one and scrapes each. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
#excluding page from base_url for further adding
res = []
for page in range(1,3):
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])

df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made up some sample keywords; replace them with the ones you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"
keywords_list = ['helmets for men', 'helmets for women']
keyword = 'helmets for men'
#excluding page from base_url for further adding
res = []
for page in range(1,3):
    for keyword in keywords_list:
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page and keyword
        if request.status_code == 404:  # added just in case of error
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')
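If you would rather feed the keywords in from a CSV file, as suggested in the question, a minimal sketch could look like this (the file name keywords.csv and its one-keyword-per-row layout are assumptions):

import csv

# Sketch: read one keyword per row from a CSV file instead of hard-coding keywords_list.
# 'keywords.csv' and its layout (keyword in the first column of each row) are assumed.
keywords_list = []
with open('keywords.csv', newline='') as f:
    for row in csv.reader(f):
        if row:  # skip blank lines
            keywords_list.append(row[0].strip())

print(keywords_list)  # feed this list into the scraping loop above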
