BeautifulSoup is creating an empty CSV file - Python

import requests
from bs4 import BeautifulSoup
import csv
# Set the URL to scrape
url = 'https://www.booking.com/searchresults.en-gb.html?ss=Hurghada&sb=1'
# Send a request to the URL and get the page content
response = requests.get(url)
content = response.content
# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup (content)
print(soup)
# Find the hotel elements
hotels = soup.find_all('div', {'data-testid="property-card"'})[:10]
print(hotels)
# Create a CSV file to save the hotel data
csv_file = open ( 'booking.csv' , 'w' , newline='' , encoding='utf-8' )
writer = csv.writer ( csv_file )
# Write the header row to the CSV file
writer.writerow ( [ 'Hotel Name' , 'Score' , 'Review Count' , 'Price' ] )
# Loop through the first 10 hotels and extract the data
for hotel in hotels [ :10 ]:
    # Extract the hotel name
    name = hotel.find ( 'div' , class_='sr-hotel__name' ).text.strip ( )
    # Extract the hotel score
    score = hotel.find ( 'div' , class_='bui-review-score__badge' ).text.strip ( )
    # Extract the review count
    review_count = hotel.find ( 'div' , class_='bui-review-score__text' ).text.strip ( )
    # Extract the price
    price = hotel.find ( 'div' , class_='sr_rooms_table_block__prices' ).text.strip ( )
    # Write the hotel data to the CSV file
    writer.writerow ( [ name , score , review_count , price ] )
# Close the CSV file
csv_file.close ( )
I am trying to make a csv file that has the following for the top 10 results:
1- Hotel Name
2- Rating Score, example 8.7 or 8
3- Rating Value, example Excellent or Very Good
4- Number of Reviews
The CSV is coming up empty, and I can't find a way to make it work.

Always and first of all, take a look at your soup to see if all the expected ingredients are in place.
There are three main issues that cause the behavior leading to an empty CSV:
Add a user-agent to your request headers, to get the expected source:
requests.get(url, headers={'user-agent': 'some agent'})
Check your selection and provide attributes in proper syntax:
hotels = soup.find_all('div', {'data-testid': "property-card"})
Before applying a method, check that the element exists.
You may also check existing answers for scraping booking.com that deal with alternative selections.
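Putting the three fixes together, a minimal sketch of the whole script could look like this. Note that the data-testid values used for the name, score and price elements inside each card are assumptions about Booking.com's current markup, not verified selectors, so compare them against your printed soup first:

import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.booking.com/searchresults.en-gb.html?ss=Hurghada&sb=1'
# 1) send a user-agent so the request returns the regular result page
response = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.content, 'html.parser')

# 2) attribute filter written as a proper dict: {'attribute': 'value'}
hotels = soup.find_all('div', {'data-testid': 'property-card'})[:10]

with open('booking.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Hotel Name', 'Score', 'Review Count', 'Price'])
    for hotel in hotels:
        # 3) check that each element exists before reading .text
        #    (these data-testid values are assumed, not verified)
        name = hotel.find('div', {'data-testid': 'title'})
        score = hotel.find('div', {'data-testid': 'review-score'})
        price = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
        writer.writerow([
            name.text.strip() if name else '',
            score.text.strip() if score else '',
            '',  # review count would need to be parsed out of the review-score block
            price.text.strip() if price else '',
        ])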

Related

Use Beautiful Soup to unify #text after a tag

I'm using Beautiful Soup to put some info from a website into an Excel table.
The bold titles are shown in the head columns while the text after the colon appears in the rows.
What I'm doing is finding the text and searching for next_sibling -->
book_year = sibling.pre.find('b', text='Anno:').next_sibling.get_text().strip()
The problem is that in some cases the text after the colon is split into different #text parts. So if I use next_sibling, I'll get only partial info.
As you can see in the inspector, the content of Titoli originali: will only be "da" if I use next_sibling.
Is there a way to unify all those #text parts? How would you approach this problem?
Thank you
UPDATES:
This is the website I'm scraping from --> http://www.letteraturenordiche.it/danimarca.htm
It's giving me a hard time because it has an inconsistent structure and no use of classes.
One thing I did is to remove from the <pre> content all of the <i> tags, <font> tags and <span> tags, to leave only the <b> ones and take the text after that.
Parsing this document isn't pretty. Probably the document was hand-written in Word and then exported to HTML:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "http://www.letteraturenordiche.it/danimarca.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# preprocess the document:
# remove all whitespaces:
for w in soup.find_all(text=True):
    if not w.strip():
        w.extract()

# unwrap not necessary tags:
for t in soup.select("i, font, span"):
    t.unwrap()

# merge NavigableStrings together:
soup.smooth()

data = []
for t in soup.select("table"):
    title = t.p.get_text(separator=" ", strip=True)
    year = (
        t.select_one('b:-soup-contains("Anno:")')
        .find_next_sibling(text=True)
        .strip()
    )
    author = (
        t.find_previous("hr", attrs={"size": "6"})
        .find_previous("p")
        .get_text(strip=True)
    )
    editor = (
        t.select_one('b:-soup-contains("Editore:")')
        .find_next_sibling(text=True)
        .strip()
    )
    pages = (
        t.select_one('b:-soup-contains("Pagine:")')
        .find_next_sibling(text=True)
        .strip()
    )
    notes = (
        t.select_one('b:-soup-contains("Note:", "Comprende")')
        .find_next_sibling(text=True)
        .strip()
    )
    original_title = t.select_one(
        'b:-soup-contains("Titolo Original", "Titolo original", "Titoli originali")'
    )
    if not original_title:
        original_title = t.find(lambda t: t.text.strip() == ":")
    if not original_title:
        original_title = ""
    else:
        original_title = original_title.find_next_sibling(text=True).strip()
    data.append((title, year, author, editor, pages, notes, original_title))

df = pd.DataFrame(
    data,
    columns=[
        "title",
        "year",
        "author",
        "editor",
        "pages",
        "notes",
        "original_title",
    ],
)
df["title"] = df["title"].str.replace(r"\r?\n", " ", regex=True)
df["author"] = df["author"].str.replace(r"\r?\n", " ", regex=True)
print(df)
df.to_csv("data.csv", index=False)
Creates the dataframe and saves it as data.csv (screenshot from LibreOffice):
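As a minimal illustration of the #text-merging step the question asks about, here is a hedged standalone sketch (the HTML fragment is made up, not taken from the actual page): smooth() joins the adjacent NavigableStrings left behind by unwrap(), so next_sibling then returns the whole text instead of just the first piece.

from bs4 import BeautifulSoup

# made-up fragment: the text after the colon is split across a <font> tag
html = '<pre><b>Titoli originali:</b> da <font>Samlede Eventyr</font></pre>'
soup = BeautifulSoup(html, "html.parser")

for t in soup.select("font"):
    t.unwrap()  # drop the tag but keep its text

print(repr(soup.find("b").next_sibling))  # ' da ' -- only the first piece

soup.smooth()  # merge adjacent NavigableStrings
print(repr(soup.find("b").next_sibling))  # ' da Samlede Eventyr'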

Beautiful Soup to Scrape Data from Static Webpages

I am trying to scrape values from a table on multiple static webpages. It is the verb conjugation data for Korean verbs here: https://koreanverb.app/
My Python script uses Beautiful Soup. The goal is to grab all conjugations from multiple URL inputs and output the data to a CSV file.
Conjugations are stored on the page in a table with class "table-responsive", under table rows with class "conjugation-row". There are multiple "conjugation-row" table rows on each page. My script is somehow only grabbing the first table row with class "conjugation-row".
Why isn't the for loop grabbing all the tr elements with class "conjugation-row"? I would appreciate a solution that grabs all tr with class "conjugation-row". I tried using job_elements = results.find("tr", class_="conjugation-row"), but I get the following error:
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Furthermore, when I do get the data and output it to a CSV file, the data is in separate rows as expected but leaves empty cells: it places the data rows for the second URL at the index after all the data rows for the first URL. See example output here:
See code here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

# create csv file
outfile = open("scrape.csv", "w", newline='')
writer = csv.writer(outfile)

## define first URL to grab conjugation names
url1 = 'https://koreanverb.app/?search=%ED%95%98%EB%8B%A4'

# define dataframe columns
df = pd.DataFrame(columns=['conjugation name'])

# get URL content
response = requests.get(url1)
soup = BeautifulSoup(response.content, 'html.parser')

# get table with all verb conjugations
results = soup.find("div", class_="table-responsive")

##### GET CONJUGATIONS AND APPEND TO CSV
# define URLs
urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']

# loop to get data
for url in urls:
    response = requests.get(url)
    soup2 = BeautifulSoup(response.content, 'html.parser')
    # get table with all verb conjugations
    results2 = soup2.find("div", class_="table-responsive")
    # get dictionary form of verb/adjective
    verb_results = soup2.find('dl', class_='dl-horizontal')
    verb_title = verb_results.find('dd')
    verb_title_text = verb_title.text
    job_elements = results2.find_all("tr", class_="conjugation-row")
    for job_element in job_elements:
        conjugation_name = job_element.find("td", class_="conjugation-name")
        conjugation_korean = conjugation_name.find_next_sibling("td")
        conjugation_name_text = conjugation_name.text
        conjugation_korean_text = conjugation_korean.text
        data_column = pd.DataFrame({'conjugation name': [conjugation_name_text],
                                    verb_title_text: [conjugation_korean_text],
                                    })
        #data_column = pd.DataFrame({verb_title_text: [conjugation_korean_text]})
        df = df.append(data_column, ignore_index=True)

# save to csv
df.to_csv('scrape.csv')
outfile.close()
print('Verb Conjugations Collected and Appended to CSV, one per column')
Get all the job_elements using find_all(), since find() only returns the first occurrence, and iterate over them in a for loop like below.
job_elements = results.find_all("tr", class_="conjugation-row")
for job_element in job_elements:
    conjugation_name = job_element.find("td", class_="conjugation-name")
    conjugation_korean = conjugation_name.find_next_sibling("td")
    conjugation_name_text = conjugation_name.text
    conjugation_korean_text = conjugation_korean.text
    # append element to data
    df2 = pd.DataFrame([[conjugation_name_text, conjugation_korean_text]], columns=['conjugation_name', 'conjugation_korean'])
    df = df.append(df2)
The error you saw comes from trying to call find() on a ResultSet, i.e. a list of elements rather than a single element.
As your script is growing big, I made some modifications, like using a get_conjugations() function and some clearer names that are easy to understand. First, conjugation_names and conjugation_korean_names are added as pandas DataFrame columns, and then the other columns are added subsequently (korean0, korean1, ...).
import requests
from bs4 import BeautifulSoup
import pandas as pd

# function to parse the html data & get conjugations
def get_conjugations(url):
    # set return lists
    conjugation_names = []
    conjugation_korean_names = []
    # get html text
    html = requests.get(url).text
    # parse the html text
    soup = BeautifulSoup(html, 'html.parser')
    # get table
    table = soup.find("div", class_="table-responsive")
    table_rows = table.find_all("tr", class_="conjugation-row")
    for row in table_rows:
        conjugation_name = row.find("td", class_="conjugation-name")
        conjugation_korean = conjugation_name.find_next_sibling("td")
        conjugation_names.append(conjugation_name.text)
        conjugation_korean_names.append(conjugation_korean.text)
    # return both lists
    return conjugation_names, conjugation_korean_names

# create csv file
outfile = open("scrape.csv", "w", newline='')
urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']

# define dataframe columns
df = pd.DataFrame(columns=['conjugation_name', 'conjugation_korean', 'korean0', 'korean1'])
conjugation_names, conjugation_korean_names = get_conjugations(urls[0])
df['conjugation_name'] = conjugation_names
df['conjugation_korean'] = conjugation_korean_names

for index, url in enumerate(urls[1:]):
    conjugation_names, conjugation_korean_names = get_conjugations(url)
    # set column name
    column_name = 'korean' + str(index)
    df[column_name] = conjugation_korean_names

# save to csv
df.to_csv('scrape.csv')
outfile.close()

# Print DONE
print('Export to CSV Complete')
Output:
,conjugation_name,conjugation_korean,korean0,korean1
0,declarative present informal low,해,먹어,마셔
1,declarative present informal high,해요,먹어요,마셔요
2,declarative present formal low,한다,먹는다,마신다
3,declarative present formal high,합니다,먹습니다,마십니다
...
Note:
This assumes that the conjugations on the different URLs appear in the same order.
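If that ordering assumption doesn't hold, a hedged alternative is to build one small DataFrame per URL and merge on the conjugation name instead of assigning the columns positionally. This sketch reuses the get_conjugations() helper and urls list from above; the korean0/korean1/... column names are just examples:

import pandas as pd

# one two-column frame per URL, merged on the conjugation name,
# so rows line up even if the pages list conjugations in different orders
frames = []
for i, url in enumerate(urls):
    names, korean = get_conjugations(url)
    frames.append(pd.DataFrame({'conjugation_name': names, 'korean' + str(i): korean}))

merged = frames[0]
for frame in frames[1:]:
    merged = merged.merge(frame, on='conjugation_name', how='outer')

merged.to_csv('scrape.csv', index=False)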

formatting data to csv file

I wrote this page scraper using Python and Beautiful Soup to extract data from a table, and now I want to save it. The area I scraped is the table on the right-hand side of the website. I need the bold part on the left side to correspond to the right side, so Key people to correspond to the CEO, for example. I'm new to this and need some advice on the best way to format it. Thank you.
import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup
# download the page
myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
# create BeautifulSoup object
soup = BeautifulSoup(myurl.text, 'html.parser')
# pull the class containing all tire name
name = soup.find(class_ = 'logo')
# pull the div in the class
nameinfo = name.find('div')
# just grab text inbetween the div
nametext = nameinfo.text
# print information about goodyear logo on wiki page
#print(nameinfo)
# now, print type of company, private or public
#status = soup.find(class_ = 'category')
#for link in soup.select('td.category a'):
#print link.text
# now get the ceo information
#for employee in soup.select('td.agent a'):
#print employee.text
# print area served
#area = soup.find(class_ = 'infobox vcard')
#print(area)
# grab information in bold on the left hand side
vcard = soup.find(class_ = 'infobox vcard')
rows = vcard.find_all('tr')
for row in rows:
    cols = row.find_all('th')
    cols = [x.text.strip() for x in cols]
    print cols

# grab information in bold on the right hand side
vcard = soup.find(class_ = 'infobox vcard')
rows = vcard.find_all('tr')
for row in rows:
    cols2 = row.find_all('td')
    cols2 = [x.text.strip() for x in cols2]
    print cols2

# save to csv file named index
with open('index.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)  # actually write to the file
    writer.writerow([cols, cols2, datetime.now()])  # append time
You need to reorder your code a bit. It is also possible to find both th and td at the same time, which would solve your problem of the two columns needing to be in sync:
import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup
myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
soup = BeautifulSoup(myurl.text, 'html.parser')
vcard = soup.find(class_='infobox vcard')
with open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    for row in vcard.find_all('tr')[1:]:
        cols = row.find_all(['th', 'td'])
        csv_output.writerow([x.text.strip().replace('\n', ' ').encode('ascii', 'ignore') for x in cols] + [datetime.now()])
This would create an output.csv file such as:
Type,Public,2018-03-27 17:12:45.146000
Tradedas,NASDAQ:GT S&P 500 Component,2018-03-27 17:12:45.147000
Industry,Manufacturing,2018-03-27 17:12:45.147000
Founded,"August29, 1898; 119 years ago(1898-08-29) Akron, Ohio, U.S.",2018-03-27 17:12:45.147000
Founder,Frank Seiberling,2018-03-27 17:12:45.147000
Headquarters,"Akron, Ohio, U.S.",2018-03-27 17:12:45.148000
Area served,Worldwide,2018-03-27 17:12:45.148000
Key people,"Richard J. Kramer (Chairman, President and CEO)",2018-03-27 17:12:45.148000
Products,Tires,2018-03-27 17:12:45.148000
Revenue,US$ 15.158 billion[1](2016),2018-03-27 17:12:45.149000
Operating income,US$ 1.52 billion[1](2016),2018-03-27 17:12:45.149000
Net income,US$ 1.264 billion[1](2016),2018-03-27 17:12:45.149000
Total assets,US$ 16.511 billion[1](2016),2018-03-27 17:12:45.150000
Total equity,US$ 4.507 billion[1](2016),2018-03-27 17:12:45.150000
Number of employees,"66,000[1](2017)",2018-03-27 17:12:45.150000
Subsidiaries,List of subsidiaries,2018-03-27 17:12:45.151000
Website,goodyear.com,2018-03-27 17:12:45.151000
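The snippet above is written for Python 2 (the CSV file is opened in 'wb' mode and the values are ASCII-encoded). A hedged Python 3 adaptation of the same approach, assuming nothing else about the page, might look like this:

import csv
from datetime import datetime

import requests
from bs4 import BeautifulSoup

myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
soup = BeautifulSoup(myurl.text, 'html.parser')
vcard = soup.find(class_='infobox vcard')

# in Python 3 the csv module expects a text-mode file opened with newline=''
with open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.writer(f_output)
    for row in vcard.find_all('tr')[1:]:
        cols = row.find_all(['th', 'td'])
        csv_output.writerow([x.text.strip().replace('\n', ' ') for x in cols] + [datetime.now()])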

Trying to parse table data into a csv file. Is there a way to parse dynamically generated table data into a row in a csv with BeautifulSoup and Python?

I have a list of names and I am trying to parse the whole table content into one CSV row using XPath. For some names, if there is less content, my webdriver crashes and the program stops, so I decided to parse the table with pandas instead. I did my research on parsing a table into a CSV file with pandas, but I don't know how to implement it.
Here is the link of the table I am trying to parse into a row in the CSV:
DLLC , ACT , OREGON , 11-25-2015 , 11-25-2017 , PPB , PRINCIPAL PLACE OF BUSINESS , 22325 SW MURPHY ST,BEAVERTON , OR and so on.
Every data field from that table should look like this in Excel, one field per cell. I don't want any header, just the table data in a row.
Now I have a list of names in a CSV, something like this:
HALF MOON BEND FARM, LLC
NICELY GROWN LLC
COPR INCORPORATED
so on......
Here is the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import lxml
import time
import csv
driver = webdriver.Chrome()
driver.get("url")
#time.sleep(5)
username = driver.find_element_by_name("p_name")
#time.sleep(1)
username.send_keys("xxxxxxx")
#username.clear()
driver.find_element_by_xpath("html/body/form/table[6]/tbody/tr/td[2]/input").click()
entity= driver.find_element_by_partial_link_text("xxxxxxx")
entity.click()
html = driver.page_source
Registry_nbr = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[1]").text
Entity_type = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[2]").text
Entity_status = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[3]").text
Registry_date = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[6]").text
#Next_renewal_date = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[6]").text
entity_name = driver.find_element_by_xpath("html/body/form/table[3]/tbody/tr/td[2]").text
Ttest=driver.find_element_by_xpath("html/body/form/table[32]/tbody/tr/td[2]").text
with open("sos.csv", "w") as scoreFile:
scoreFileWriter = csv.writer(scoreFile)
scoreFileWriter.writerow([Registry_nbr,Entity_type,Entity_status,Registry_date,entity_name],)
scoreFile.close()
soup =BeautifulSoup(html)
for tag in soup.find_all('table'):
print tag.text
Use this after entity.click():
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
words = soup.find_all("td")
table_data = soup.get_text().encode('utf-8')
word = list()
for cell in words:
    word.append((cell.text).encode('utf-8'))
with open('name.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    spamwriter.writerow(word)
Hope this will help.
Once you have the html you can parse it using BeautifulSoup and find the table you want. Looking at the HTML page you reference, I do not see any class ids or identifying keys to search for, so just indexing into table[2] will have to do.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
NBSP = u'\xa0'
tables = [ [ map(lambda d: d.text.replace(NBSP, u''), r.findAll('td'))
             for r in t.findAll('tr') ]
           for t in soup.findAll('table') ]
business_entity_data = tables[2]
keys = business_entity_data[0]
with open('page.csv', 'wb') as csvfile:
    csvwriter = csv.DictWriter(csvfile, keys)
    csvwriter.writeheader()
    csvwriter.writerow(dict(zip(keys, business_entity_data[1])))
You should end up with a file containing:
Registry Nbr,Entity Type,Entity Status,Jurisdiction,Registry Date,Next Renewal Date,Renewal Due?
1164570-94,DLLC,ACT,OREGON,11-25-2015,11-25-2017,
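Since the question also mentions wanting to do this with pandas but not knowing how to implement it, here is a hedged sketch using pandas.read_html on the page source. The table index 2 is carried over from the answer above and is an assumption; print the list of tables to confirm which one you need:

import pandas as pd

# pandas.read_html parses every <table> in the page source into a DataFrame
tables = pd.read_html(driver.page_source)

# index 2 is assumed, as in the answer above
business_entity = tables[2]

# write the table without the pandas index; drop header=False if you want column names
business_entity.to_csv('page.csv', index=False, header=False)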

How to scrape websites with Python and beautiful soup

I am trying to scrape results from the BBC Sport website. I've got the scores working, but when trying to add team names the program prints out None 1-0 None (for example). This is the code:
from bs4 import BeautifulSoup
import urllib.request
import csv
url = 'http://www.bbc.co.uk/sport/football/teams/derby-county/results'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page)
for match in soup.select('table.table-stats tr.report'):
    team1 = match.find('span', class_='team-home')
    team2 = match.find('span', class_='team-away')
    score = match.abbr
    print(team1.string, score.string, team2.string)
It looks like you are searching for tags that are not there. For instance class_="team-home teams" is in the html, but class_='team-home' is not. The following code prints the first team name:
tables = soup.find_all("table", class_="table-stats")
tables[0].find("span", class_="team-home teams").text
# u' Birmingham '
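Building on that, a hedged variation of the original loop that uses the full class names and skips rows where something is missing could look like this (class names taken from the answer above, not re-verified against the current page):

for match in soup.select('table.table-stats tr.report'):
    team1 = match.find('span', class_='team-home teams')
    team2 = match.find('span', class_='team-away teams')
    score = match.abbr
    # only print rows where all three pieces were found
    if team1 and team2 and score:
        print(team1.get_text(strip=True), score.get_text(strip=True), team2.get_text(strip=True))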
Here is a possible solution which gets the home and away team names, the final score, the match date and the competition name via BeautifulSoup and puts it in a DataFrame.
import requests
import pandas as pd
from bs4 import BeautifulSoup
#Get the relevant webpage set the data up for parsing
url = "http://www.bbc.co.uk/sport/football/teams/derby-county/results"
r = requests.get(url)
soup=BeautifulSoup(r.content,"lxml")
#set up a function to parse the "soup" for each category of information and put it in a DataFrame
def get_match_info(soup, tag, class_name, column_name):
    info_array = []
    for info in soup.find_all('%s' % tag, attrs={'class': '%s' % class_name}):
        info_array.append({'%s' % column_name: info.text})
    return pd.DataFrame(info_array)
#for each category pass the above function the relevant information i.e. tag names
date = get_match_info(soup,"td","match-date","Date")
home_team = get_match_info(soup,"span","team-home teams","Home Team")
score = get_match_info(soup,"span","score","Score")
away_team = get_match_info(soup,"span","team-away teams","Away Team")
competition = get_match_info(soup,"td","match-competition","Competition")
#Concatenate the DataFrames to present a final table of all the above info
match_info = pd.concat([date,home_team,score,away_team,competition],ignore_index=False,axis=1)
print(match_info)
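Since the original goal involves writing a CSV anyway, the combined DataFrame can then be written straight to a file; the file name here is just an example:

match_info.to_csv("derby_results.csv", index=False)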
