add extracted data as a new column in the existing csv-python - python

Could anyone please help? Please point out where I am going wrong: the extracted reviews are written into 3 separate columns in hotelreview.csv. How can I fix this so that they are written into 1 column, and how can I add the heading "review" for it, based on the code below?
I also want to add the newly extracted data (the "review" column) to the existing CSV 'hotel_FortWorth.csv'. So far I have only written the extracted information to a new CSV; I don't know how to combine the two files, or whether there is a better way. The URL can be repeated to match the reviews. Please help!
Thank you!
File 'hotel_FortWorth.csv' has 3 columns, for example:
Name link
1 Omni Fort Worth Hotel https://www.tripadvisor.com.au/Hotel_Review-g55857-d777199-Reviews-Omni_Fort_Worth_Hotel-Fort_Worth_Texas.html
2 Hilton Garden Hotel https://www.tripadvisor.com.au/Hotel_Review-g55857-d2533205-Reviews-Hilton_Garden_Inn_Fort_Worth_Medical_Center-Fort_Worth_Texas.html
3......
...
I used the URLs from the existing CSV to extract the reviews; the code is shown below:
import requests
from unidecode import unidecode
from bs4 import BeautifulSoup
import pandas as pd
# Accumulates every review <p> tag found across all of the hotel pages.
file = []
# header = None makes pandas label the columns 0, 1, 2 and read the CSV's first row as data.
data = pd.read_csv('hotel_FortWorth.csv', header = None)
# Column 2 is the one whose values are used as URLs below.
df = data[2]
# df[1:] skips the first row of that column.
for url in df[1:]:
print(url)
thepage = requests.get(url).text
soup = BeautifulSoup(thepage, "html.parser")
# Review snippets are looked up as <p class="partial_entry"> elements.
resultsoup = soup.find_all("p", {"class": "partial_entry"})
file.extend(resultsoup)
with open('hotelreview.csv', 'w', newline='') as fid:
for review in file:
review_list = review.get_text()
# NOTE(review): the text is written raw, one review per line, with no CSV quoting and no
# header line; presumably commas inside a review are why it shows up split across columns.
fid.write(unidecode(review_list+'\n'))
Expected result:
name link review
1 ... ... ...
2
....

You can use pandas to create the new CSV.
Ex:
import requests
from unidecode import unidecode
from bs4 import BeautifulSoup
import pandas as pd
# Load the existing hotel list; its header row supplies the "link" column.
data = pd.read_csv('hotel_FortWorth.csv')

# One entry per hotel row, in the same order as data["link"].
review = []
for url in data["link"]:
    print(url)
    thepage = requests.get(url).text
    soup = BeautifulSoup(thepage, "html.parser")
    resultsoup = soup.find_all("p", {"class": "partial_entry"})
    # find_all() returns a list of tags, not a string, so unidecode() cannot be
    # applied to it directly: take the text of each tag and join the pieces so
    # that every hotel ends up with exactly one "review" cell.
    review_text = " ".join(tag.get_text(strip=True) for tag in resultsoup)
    review.append(unidecode(review_text))

# Add the scraped text as a new column next to the existing ones.
data["review"] = review
# index=False keeps pandas from writing its own row index as an extra column.
data.to_csv('hotelreview.csv', index=False)

Related

Web scraping in python; Output to excel returns HTML instead of the data frame

I'm new to Python and practicing web scraping. I was writing code to get all the movie names and the respective year of each movie. I got the result into a dataframe, but when I export it to Excel, it shows up as HTML code. Thanks in advance.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
# Page numbers to request; np.arange(1,2,1) yields only the value 1.
pages=np.arange(1,2,1)
pwd = os.getcwd()
# Empty frame that the scraped rows are appended to.
yify = pd.DataFrame (columns = ['Title', 'Year'])
#looping through different pages
for page in pages:
page='https://yts.mx/browse-movies?page='+str(page)
data=requests.get(page).text
soup = BeautifulSoup(data,'html.parser')
#looping through all the movie names and years
for row in soup.find_all('div', class_ = 'browse-movie-bottom'):
# NOTE(review): title and years hold whatever row.find(...) returns, i.e. the matched
# elements themselves rather than their text.
title=row.find('a', class_ = 'browse-movie-title')
years=row.find('div', class_ = 'browse-movie-year')
yify=yify.append({'Title': title, 'Year':years},ignore_index=True)
yify.head()
# NOTE(review): `output` is not defined anywhere in this snippet; presumably `yify` was
# intended — confirm.
output.to_excel(pwd + '\\Yify_Test_Output.xlsx', index=False )
Try:
for row in soup.find_all('div', class_='browse-movie-bottom'):
    # row.find(...) returns the whole element; .text keeps only its text
    # content and .strip() trims the surrounding whitespace.
    title = row.find('a', class_='browse-movie-title').text.strip()
    years = row.find('div', class_='browse-movie-year').text.strip()
As a short explanation: the OP was retrieving the actual HTML element, including HTML tags. You extract the text from the element with <html_element>.text.

I cannot separate scraped data into different category

I need help from a Python expert. This is a site where I have to scrape table data, separate it into four different categories, and then convert it into an Excel file, but the problem is that all the table categories' classes are the same.
There should be four different classes, but all four are the same.
Thanks
Mariful
Website for scrape
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
url = "https://www.kpaa.or.kr/kpaa/eng/list.do?"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Collects every element whose class is "title".
items = soup.find_all(class_='title')
for item in items:
n = item.text
print(n)
# NOTE(review): the same value n is passed for all four columns, and the attribute is
# spelled `pd.Dataframe` here whereas the pandas class is `pd.DataFrame`.
df = pd.Dataframe({'name':n, 'office':n, 'phone':n, 'email':n})
Here I try to convert each row of data into a 2D list to use in a pandas data frame.
import csv
import re

import pandas as pd
import requests  # used by requests.get() below but missing from the original imports
from bs4 import BeautifulSoup

url = "https://www.kpaa.or.kr/kpaa/eng/list.do?"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# Every <tr> inside the div with class "cont_box2", except the last one.
table_rows = soup.find('div', {'class': 'cont_box2'}).find_all('tr')[:-1]

# One list of stripped, non-empty text pieces per row. Reading the pieces
# directly (instead of joining them with ',' and splitting again) keeps a
# piece that itself contains a comma in one cell.
data_list = [list(tr.stripped_strings) for tr in table_rows]

df = pd.DataFrame(data_list)
df.to_excel('x.xlsx')

Empty Dataframe when scraping specific column from website

I wanted to try to scrape some specific columns (the Company details column) on the CNBC Nasdaq 100 website, specifically for the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
# Fetch one quote/profile page and collect scraped fields into a dict and a DataFrame.
# url: address of the page to request.
# Returns the tuple (key, df): the dict of scraped fields and the DataFrame built from the list l.
def get_company_info(url):
# original_url is assigned here but not read again in this block.
original_url = url
key = {}
l = []
page_response = requests.get(url, timeout=240)
page_content = BeautifulSoup(page_response.content, "html.parser")
# The symbol text is read from the "symbol" span inside the quote header div.
name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
description = page_content.find_all('div',{"class":"moduleBox"})
for items in description:
for i in range(len(items.find_all("tr"))-1):
# Gather data
key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
# NOTE(review): .find() is called on the result of find_all() rather than on a single
# element — confirm this is what was intended.
shares = items.find_all("td").find("table",attrs={"id":"shares"})
for rest_of_items in shares:
for i in range(len(items.find_all("tr"))-1):
# NOTE(review): the four assignments below all read the same index i of the same
# "bold aRit" cell list, so they receive the same text.
key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
# Print ("")
l.append(key)
key['name'] = name
df = pd.DataFrame(l)
print(df)
return key, df
# Run the scraper against the Adobe (ADBE) profile page.
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in a dataframe so that I can convert it to a CSV file, but my code keeps showing an empty dataframe. Below is the error shown:
The result I wanted is something like this
The information you are looking for is not available at the URL you requested. This is because the page fetches the information using JavaScript, which in turn requests a different URL that provides the data.
Example code
from bs4 import BeautifulSoup
import requests
# Request the summary page that actually carries the company data.
page = requests.get(
    "https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary"
)
soup = BeautifulSoup(page.content, 'html.parser')

# Company name and long description are looked up by element id.
Name = soup.find("h5", id="companyName").text
stock_desc = soup.find("div", id="descLong").text

# The "bold aRit" cells of the "shares" table, read by position 0..3.
share_cells = soup.find("table", id="shares").find_all("td", class_="bold aRit")
(
    stock_outstanding_shares,
    stock_ownership,
    stock_market_cap,
    stock_lastSplit,
) = [share_cells[position].text for position in range(4)]
You can create dataframe and export to csv.

How do I use BeautifulSoup4 to get ALL text before <br> tag into a pandas data frame

I'm trying to extract the postcodes and suburbs of Sydney as a dataframe by scraping a website.
I've located the corresponding tag in BeautifulSoup but am not able to finish the process.
import requests
from bs4 import BeautifulSoup
url = 'https://data.mongabay.com/igapo/australia/postcodes/sydney-numeric.html'
res = requests.get(url)
soup = BeautifulSoup(res.text,'html5lib')
# The table and the cell are both picked by position: index 10 of the <table>
# elements, then index 2 of that table's <td> elements.
table = soup.find_all('table')[10]
rows = table.find_all('td')[2]
# Replace each <br> in the cell with a newline character.
for br in rows.find_all("br"):
br.replace_with("\n")
# Text of the cell after the <br> replacement above.
parsedText = rows.get_text()
I'm expecting a dataframe such as:
postcode suburbs
2000 Australia Square Post Office
2000 Circular Quay
2000 Clarence Street Post Office
...
thank you for your help
A somewhat verbose method. bs4 4.7.1
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs  # `bs` is used below but was never imported

url = 'https://data.mongabay.com/igapo/australia/postcodes/sydney-numeric.html'
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
codes = []
names = []

# NOTE(review): `:contains` is the selector spelling for the bs4 4.7.1 this
# answer targets; newer Soup Sieve releases presumably prefer
# `:-soup-contains` — confirm against the installed version.
block = soup.select_one('h1:contains("Postal codes") ~ font [size="2"]')

for line in block.text.split('\n'):
    # Skip empty lines; the first four characters are taken as the code and
    # everything from index 5 onwards as the name.
    if line[:4]:
        codes.append(line[:4])
        names.append(line[5:])

df = pd.DataFrame(list(zip(codes, names)), columns=['code', 'name'])
You did pretty much all the work! You just have to read it in correctly.
from io import StringIO
import re

import pandas as pd

# parsedText is the string built by the question's code. StringIO lets
# read_csv treat it like a file; the regex separator splits on a space that
# is preceded by a digit. The result is kept in df instead of being discarded.
df = pd.read_csv(StringIO(parsedText), sep=r'(?<=\d) ', header=None, engine='python')
So we need to use StringIO to make your text readable by pd.read_csv, then we can do some regex to define our separator.
r'(?<=\d) ' looks for a space (notice the space after ')' ) if there is a digit preceding it.

Creating Pandas Dataframe from WebScraping Results

I am trying to scrape a table from espn and send the data to a pandas dataframe in order to export it to excel. I have completed most of the scraping, but am getting stuck on how to send each 'td' tag to a unique dataframe cell within my for loop. (Code is below) Any thoughts? Thanks!
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
# Open url with urllib.request and return the response parsed by BeautifulSoup
# using the "html.parser" parser.
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
# NOTE(review): the URL string literal is split across two lines here; presumably a
# line-wrap artifact of the paste — confirm.
soup = make_soup("http://www.espn.com/nba/statistics/player/_/stat/scoring-
per-game/sort/avgPoints/qualified/false")
# Pattern for class values whose first character is a lowercase letter from e to o.
regex = re.compile("^[e-o]")
# Print every <td> of every <tr> whose class matches the pattern.
for record in soup.findAll('tr', {"class":regex}):
for data in record.findAll('td'):
print(data)
I was actually recently scraping sports websites working on a daily fantasy sports algorithm for a class. This is the script I wrote up. Perhaps this approach can work for you. Build a dictionary. Convert it to a dataframe.
# The URL must be a quoted string literal; unquoted it is a syntax error.
# NOTE(review): {0} and {1} are str.format placeholders that this snippet never
# fills in (presumably the year and the stat mode) — call url.format(...) with
# real values before requesting.
url = "http://www.footballdb.com/stats/stats.html?lg=NFL&yr={0}&type=reg&mode={1}&limit=all"
result = requests.get(url)
c = result.content

# Set as Beautiful Soup Object (parser named explicitly so the choice does not
# depend on which parsers happen to be installed)
soup = BeautifulSoup(c, "html.parser")

# Go to the section of interest
tables = soup.find("table", {'class': 'statistics'})

# data maps column index -> {row index -> cell text};
# headers maps column index -> header text.
data = {}
headers = {}
for i, header in enumerate(tables.findAll('th')):
    data[i] = {}
    headers[i] = str(header.get_text())

table = tables.find('tbody')
for r, row in enumerate(table.select('tr')):
    for i, cell in enumerate(row.select('td')):
        # get_text() already returns str on Python 3, so the original bare
        # `except:` fallback (which called the undefined strip_non_ascii)
        # is dropped rather than left to mask real errors.
        data[i][r] = cell.get_text()

# Overwrite column 0 with the text of the links matched by this selector.
for i, name in enumerate(tables.select('tbody .left .hidden-xs a')):
    data[0][i] = str(name.get_text())

df = pd.DataFrame(data=data)

Categories

Resources