I'm scraping a website to gather the ten most recent articles for each of a few keywords. Once I have my data (keyword used, article name, URL/hyperlink, and publication date), I want to write it all to an xls file. So far it only writes the results for the last keyword rather than all four; it keeps overwriting the same section of the spreadsheet. How can I write out my entire list, not just the most recent section?
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')

for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    mylist = []
    cols = "KeyWord", "Article", "URL", "Publication Date"
    mylist.append(cols)
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)
    book = xlwt.Workbook()
    sheet = book.add_sheet("Articles")
    for i, row in enumerate(mylist):
        for j, col in enumerate(row):
            sheet.write(i, j, col)
    book.save("C:\Python\American Banker\American Banker {}.xls".format(today))

for keyword in keywords:
    article_fetch(keyword)

print('Workbook Saved')
I would expect to see my entire list, with results for all four keywords; however, I am only seeing the results for the last keyword.
I've moved the Excel file generation to the end of the script. In your version, article_fetch started a fresh mylist and a fresh Workbook on every call, so each keyword's results replaced the file written for the previous one:
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')

for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)

mylist = []
cols = "KeyWord", "Article", "URL", "Publication Date"
mylist.append(cols)

for keyword in keywords:
    article_fetch(keyword)

book = xlwt.Workbook()
sheet = book.add_sheet('Articles')
for i, row in enumerate(mylist):
    for j, col in enumerate(row):
        sheet.write(i, j, col)

book.save("American Banker {}.xls".format(today))
print('Workbook Saved')
Data won't be lost anymore.
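As an optional refinement (a sketch only, reusing the same imports, headers and keywords from the script above), article_fetch can return its rows instead of appending to a module-level list; the workbook write-out at the end stays exactly the same:

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    rows = []
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        rows.append([keyword, article, Formula(link), timestamp])
    return rows  # the caller collects the rows, so nothing relies on a global

mylist = [("KeyWord", "Article", "URL", "Publication Date")]
for keyword in keywords:
    mylist.extend(article_fetch(keyword))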
I am trying to create a data frame from a dictionary I have and it gives me an error that says:
ValueError: could not broadcast input array from shape (3) into shape (1)
Here is the code:
import requests
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString
import pandas as pd  # needed for pd.DataFrame below

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

product_links = []

for x in range(1, 13):
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            product_links.append(link['href'])

for thing in product_links:
    if '#' in product_links:
        product_links.remove('#')

# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'

for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    job_title = soup.find('h1', class_="brkword")
    job_location = soup.find('strong', class_="c2b")
    job_date = soup.find('span', class_="pdingleft10 brlefte5")
    try:
        seniority = soup.find_all('td', class_='value')[0].text.strip()
    except:
        print("")
    try:
        full_or_part = soup.find_all('td', class_='value')[1].text.strip()
    except:
        print("")
    try:
        education_level = soup.find_all('td', class_='value')[2].text.strip()
    except:
        print("")
    try:
        sector = soup.find_all('td', class_='value')[3].text.strip()
    except:
        print("")
    description = soup.find_all('p', class_='pding10')

df = {
    "Job Title": job_title,
    "Job Location": job_location,
    "Post Date": job_date,
    "Seniority Level": seniority,
    "Full or Part time": full_or_part,
    "Educational Level": education_level,
    "Sector": sector,
    "Job Description": description
}

job_data = pd.DataFrame(df)
Please tell me how I can transform the data I have into a data frame so I can export it to a csv. I was trying to scrape this jobs website and the scrape itself was successful, returning about 500 jobs in the dictionary, but I was unfortunately not able to turn the result into a dataframe so that I could later export it to a csv file and do some analysis on it.
To create a dataframe from the job ads, you can try the next example (some column names need to be renamed from Arabic to English, though):
import requests
import pandas as pd
from bs4 import BeautifulSoup

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

product_links = []

for x in range(1, 2):  # <-- increase the range here
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            if link["href"] != "#":
                product_links.append(link["href"])

all_data = []
for link in product_links:
    print(f"Getting {link} ...")
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "lxml")
    d = {}
    job_title = soup.find("h1").get_text(strip=True)
    job_location = soup.find("strong", class_="c2b")
    job_date = soup.find("span", class_="pdingleft10 brlefte5")
    d["title"] = job_title
    d["location"] = job_location.get_text(strip=True) if job_location else "N/A"
    d["date"] = job_date.get_text(strip=True) if job_date else "N/A"
    for table in soup.select("table.item"):
        d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)
    all_data.append(d)

job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
Creates data.csv (screenshot from LibreOffice).
When trying to run the script the following error appears:
line 16
for tag in jogos:
^
IndentationError: expected an indented block
My expected result is:
COLUMN 1             COLUMN 2
Team A v Team B      LINK HREF
Team C v Team D      LINK HREF
Team E v Team F      LINK HREF
Team G v Team H      LINK HREF
Another problem is that when I store the results in the variables namelist and linkslist, only the first value is delivered rather than all possible values. In this answer (https://stackoverflow.com/a/68446386/11462274), printing the values directly, like this:
print(tag.find("a", href=True).get_text().strip())
Or this:
print(tag.find("a", href=True)["href"])
The result is the complete list of values, but when I assign them to specific names like namelist and linkslist, it stops returning the complete list and delivers only one value.
Full Script:
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
with open ('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
for tag in jogos:
    namelist = tag.find("a", href=True).get_text().strip()
    linkslist = tag.find("a", href=True)["href"]
    row = namelist + ';' + linkslist + '\n'
    f.write(row)
The error message says it plainly: since you're using a context manager (with open(...)), the code that uses f has to be written inside that indented block.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open("Lista_de_Jogos.csv", "a", newline="", encoding="UTF8") as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ";" + linkslist + "\n"
        f.write(row)
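As a side note on the write-out itself, Python's built-in csv module can handle the delimiter and any quoting for you; a minimal sketch of the same loop (same URL and variable names as above):

import csv
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open("Lista_de_Jogos.csv", "a", newline="", encoding="UTF8") as f:
    writer = csv.writer(f, delimiter=";")
    for tag in jogos:
        a = tag.find("a", href=True)
        # writerow joins the fields with ";" and appends the newline itself
        writer.writerow([a.get_text().strip(), a["href"]])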
You have to indent the code after the 'with open' statement.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
print(jogos)

with open('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ';' + linkslist + '\n'
        f.write(row)
I have a bunch of URLs in a csv file and I have to extract data from those URLs into another csv file. I extracted the data into a dataframe using the code below, but when it comes to saving the extracted data to the output csv, it only contains the last extraction (i.e. if I have 10 URLs in demo.csv, only the data from the 10th URL appears in the output csv, not the data from all the URLs).
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)

dict = {'url': [url], 'website': [website]}
df = pd.DataFrame(dict)
df.to_csv('export_dataframe.csv', index=False)
The problem seems to be the indentation of the line where you add your data to the dict: it sits outside the loop, therefore only the last URL's data is added. I have pointed it out with a comment in the code below.
import csv
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []

with open('demo.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        url = row[0]
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36/8mqHiSuL-56"}
        response = requests.get(url, headers=header)
        print(url)
        soup = BeautifulSoup(response.content, "html.parser")
        website = soup.find('div', class_="arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-middle__373c0__2TQsQ border-color--default__373c0__2oFDT")
        if website is None:
            website = '-'
        else:
            website = website.text.replace('Business website', '')
        print(website)
        time.sleep(2)
        data.append([url, website])  # this line is outside the loop in your code; a plain list is used here just to simplify (a dict would work too)

df = pd.DataFrame(data, columns=['url', 'website'])
df.to_csv('export_dataframe.csv', index=False)
I'm scraping a particular website to collect information on different currencies. I'm unable to retrieve all of the data when I write to the csv file. Please let me know how to go about it.
Code
import requests
import pandas as pd
from bs4 import BeautifulSoup

lista = ["eur-usd-historical-data", "usd-jpy-historical-data"]
listb = []
link = "https://www.investing.com/currencies/%d"

for k, v in enumerate(lista):
    urlConcat = link.replace("%d", lista[k])
    k += 1
    listb.append(urlConcat)

for url in listb:
    lista = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }
    response = requests.get(url, headers=headers).text
    data = BeautifulSoup(response, 'html.parser')
    # print(data)
    divs = data.findAll("table", {"class": "genTbl closedTbl historicalTbl"})
    for div in divs:
        row = ''
        row = div.findAll("td")
        for rows in row:
            if (rows.text.find("tr")):
                lista.append(rows.text)
    final_list = [lista[6*i:6*(i+1)] for i in range(int(round(len(lista)/6)) + 1)]
    final_df = pd.DataFrame(final_list, columns=['date', 'price', 'opne', 'high', 'low', '% change'])
    final_df['currency'] = url
    final_df['currency'] = final_df['currency'].str.split('/').str[-1].str.split('-').str[:2].str.join('-')
    print(final_df)
    final_df.to_csv('result.csv')
Here the result in the csv consists of data for only one currency, and if I print final_df in a different cell it also shows the result for only one currency. What should be done to fix this issue?
Try appending to the csv instead, maybe like:
final_df.to_csv('result.csv', mode='a', header=False)
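With mode='a' inside the loop the file keeps the rows from every currency, but the header line will repeat for each chunk (and rows from an earlier run of the script will still be there). If you want the header exactly once, one common pattern is to write it only on the first iteration. A toy sketch of that pattern; the placeholder frames below just stand in for the per-currency final_df built inside the "for url in listb:" loop:

import pandas as pd

# Placeholder frames standing in for each currency's final_df; the real script
# builds these inside the loop over listb.
frames = [
    pd.DataFrame({"date": ["..."], "price": ["..."], "currency": ["eur-usd"]}),
    pd.DataFrame({"date": ["..."], "price": ["..."], "currency": ["usd-jpy"]}),
]

for i, final_df in enumerate(frames):
    # mode="a" appends; header=(i == 0) writes the column names only once
    final_df.to_csv("result.csv", mode="a", header=(i == 0), index=False)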
I am trying to scrape the table from the "https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days" website, but the Python output is not giving me the table.
import requests
from bs4 import BeautifulSoup
url = 'https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
response = requests.get(url, headers=headers)
print(response)
soup = BeautifulSoup(response.text, 'lxml')
print(soup)
data_array = soup.find(id='table-wrap my-3 borderSet maxHeight-900 scrollWrap').get_text().strip().split(":")
type(data_array)
The output is printing the HTML tags instead of the table.
If you want the table, there's a download link available on that page; it gives you the data as a csv file, so you don't need any code. Why don't you just use that?
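If you do use the download, reading the saved file back into Python is then a one-liner (the filename below is only a placeholder for whatever name the download gives you):

import pandas as pd

# hypothetical filename; replace it with the name of the downloaded csv
df = pd.read_csv("event-calendar.csv")
print(df.head())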
This code will return the whole table as a list. It assumes a Selenium page-object style helper (self.find_element), with table_locator being the table's XPath locator:
data_table = self.find_element(table_locator).get_attribute('innerHTML').replace('<th></th>', '')
soup = BeautifulSoup(data_table, 'lxml')
data_rows = soup.find_all('tr')
rows_values_scrape = [[td.getText() for td in data_rows[i].findAll('td')]
                      for i, v in enumerate(data_rows)]
rows_values = [x for x in rows_values_scrape if x]
columns_scrape = [[td.getText() for td in data_rows[i].findAll('th')]
                  for i, v in enumerate(data_rows)]
columns = [x for x in columns_scrape if x]
table = []
if columns[1:] != []:
    for i, r in enumerate(columns[1:]):
        table.append([f'column: {columns[0][j]}, row_title: {columns[1:][i][0]}, cell: {rows_values[i][j]}' for j, c in enumerate(columns[0])])
else:
    table = [f'column: {columns[0][j]}, cell: {rows_values[0][j]}' for j, c in enumerate(columns[0]) if columns[1:] == []]
return table