How scrape comment by soup - python

I would like scrape comments from this site https://www.ceneo.pl/sklepy/morele.net-s379
But after scrpaed i got empty file. What i did wrong ?
This is my code
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.ceneo.pl/sklepy/morele.net-s379")
soup = BeautifulSoup(page.content, "html.parser",
).find_all("div", class_="js_shop-reviews js_shop reviews-offer")
morele = [[ i.getText(strip=True) for i in div.find("div") if i.getText()] for div in soup]
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)
#print(csv_table)
#Export to Csv file
csv_table.to_csv(r'C:/Users/admin/Desktop/morele.csv',";",encoding='utf-8-sig',index = False, header=True)

try this
I found the comment was under class user-post__text so changed it.
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.ceneo.pl/sklepy/morele.net-s379")
soup = BeautifulSoup(page.content, "html.parser",
).find_all("div", {"class":"user-post__text"}) #changes made here
# print(soup)
morele = [div.getText(strip=True) for div in soup] #and here as well
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)
#print(csv_table)
#Export to Csv file
csv_table.to_csv(r'morele.csv',";",encoding='utf-8-sig',index = False, header=True)
Does this solves your problem?

Related

How to get data from website into Pandas dataframe

I need to write a Python script that will use data from a website like this link and put it into a pandas dataframe.
My try is
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
url = "https://www.w3schools.com/sql/trysql.asp?filename=trysql_op_in"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
league_table = soup.find('table', class_ = "table.ws-table-all.notranslate")
print(league_table)
And I got output: None

How to scrape review to dataframe

I would like to scratch the reviews from this page and save them as a data frame, but I do not download star ratings and the text of the review. Just only text. What i did wrong?
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser",
).find_all("div", {"class":"reviews-item"})
# print(soup)
morele = [div.getText(strip=True) for div in soup]
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0,'No.',csv_table.index)
You are mostly there - just further navigate the DOM and you can get just the text.
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser",)
data = [{"text":ri.find("div", {"class":"rev-desc"}).getText(strip=True) ,
"stars":ri.find("div", {"class":"rev-stars"}).getText(strip=True)}
for ri in soup.find_all("div", {"class":"reviews-item"})
]
pd.DataFrame(data)

Python html parsing using beautiful soup issues

I am trying to get the name of all organizations from https://www.devex.com/organizations/search using beautifulsoup.However, I am getting an error. Can someone please help.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
headers = {"Accept-Language": "en-US,en;q=0.5"}
titles = []
pages = np.arange(1, 2, 1)
for page in pages:
page = requests.get("https://www.devex.com/organizations/search?page%5Bnumber%5D=" + str(page) + "", headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
movie_div = soup.find_all('div', class_='info-container')
sleep(randint(2,10))
for container in movie_div:
name = container.a.find('h3', class_= 'ng-binding').text
titles.append(name)
movies = pd.DataFrame({
'movie': titles,
})
to see your dataframe
print(movies)
to see the datatypes of your columns
print(movies.dtypes)
to see where you're missing data and how much data is missing
print(movies.isnull().sum())
to move all your scraped data to a CSV file
movies.to_csv('movies.csv')
you may try with something like
name = bs.find("h3", {"class": "ng-binding"})

How to specify table for BeautifulSoup to find?

I'm trying to grab the table on this page https://nces.ed.gov/collegenavigator/?id=139755 under the Net Price expandable object. I've gone through tutorials for BS4, but I get so confused by the complexity of the html in this case that I can't figure out what syntax and which tags to use.
Here's a screenshot of the table and html I'm trying to get:
This is what I have so far. How do I add other tags to narrow down the results to just that one table?
import requests
from bs4 import BeautifulSoup
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
soup = soup.find(id="divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02")
print(soup.prettify())
Once I can parse that data, I will format into a dataframe with pandas.
In this case I'd probably just use pandas to retrieve all tables then index in for appropriate
import pandas as pd
table = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')[10]
print(table)
If you are worried about future ordering you could loop the tables returned by read_html and test for presence of a unique string to identify table or use bs4 functionality of :has , :contains (bs4 4.7.1+) to identify the right table to then pass to read_html or continue handling with bs4
import pandas as pd
from bs4 import BeautifulSoup as bs
r = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = bs(r.content, 'lxml')
table = pd.read_html(str(soup.select_one('table:has(td:contains("Average net price"))')))
print(table)
ok , maybe this can help you , I add pandas
import requests
from bs4 import BeautifulSoup
import pandas as pd
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
div = soup.find("div", {"id": "divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02"})
table = div.findAll("table", {"class": "tabular"})[1]
l = []
table_rows = table.find_all('tr')
for tr in table_rows:
td = tr.find_all('td')
if td:
row = [i.text for i in td]
l.append(row)
df=pd.DataFrame(l, columns=["AVERAGE NET PRICE BY INCOME","2015-2016","2016-2017","2017-2018"])
print(df)
Here is a basic script to scrape that first table in that accordion:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
print(desired_table.prettify())
I assume you only want the values within the table so I did an overkill version of this as well that will combine the column names and values together:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
header_row = desired_table.find_all('th')
headers = []
for header in header_row:
header_text = header.get_text()
headers.append(header_text)
money_values = []
data_row =desired_table.find_all('td')
for rows in data_row:
row_text = rows.get_text()
money_values.append(row_text)
for yrs,money in zip(headers,money_values):
print(yrs,money)
This will print out the following:
Average net price
2015-2016 $13,340
2016-2017 $15,873
2017-2018 $16,950

How to get the data from <script> with no attributes inside <body>?

I am trying to extract all coordinates of locations of the restaurant using beautiful soup.How can i extract all of them from the script tag under the body?
from bs4 import BeautifulSoup as bs
import requests
import urllib2
import json
base_url = 'https://locations.wafflehouse.com/'
r = requests.get(base_url)
soup = bs(r.text, 'html.parser')
all_scripts = soup.find_all('script')
print all_scripts[19]
UPDATED ANSWER:
You need to parse the json in json.loads() and then navigate, try this code...working smoothly!
import json, requests
from bs4 import BeautifulSoup
req = requests.get('https://locations.wafflehouse.com/')
soup = BeautifulSoup(req.content, 'html.parser')
data = soup.find_all('script')[19].text.encode('utf-8')
jdata = data.replace('window.__SLS_REDUX_STATE__ =', '').replace(';', '')
data = json.loads(jdata)
for i in data['dataLocations']['collection']['features']:
LatLong = (i['geometry']['coordinates'])
print(LatLong)
Output:
[-90.073113, 30.37019]
[-84.131085, 33.952944]
[-78.719497, 36.14261]
[-95.629084, 29.947421]
[-83.9019, 33.56531]
[-80.091552, 37.288422]
[-77.949231, 34.237534]
[-96.60637, 32.968131]
[-80.969088, 29.151235]
[-86.843386, 33.354666]
[-84.206, 33.462175]
[-76.342464, 36.830187]
[-79.985822, 32.898412]
[-84.2784568595722, 33.
[-88.780694, 35.674914]
[-87.898899, 30.598605]
[-83.71487, 32.614092]
[-79.523611, 36.07101]
[-91.127792, 30.580582]
[-86.352681, 35.875097]
[-90.271372, 30.023002]
[-80.205641, 25.955672]
[-81.632, 30.157]
[-86.961821, 31.454352]
[-80.666906, 35.366769]
[-97.56596, 35.406447]
[-84.364334, 35.511474]
[-81.01622, 29.23453]
[-86.57177, 34.855504]
[-84.625908, 33.399829]
[-76.344303, 36.740862]
[-84.192634, 33.517948]
[-77.83421, 39.296024]
[-77.518985, 38.359332]
[-84.45238, 38.042061]
[-83.08319, 39.840191]
[-81.993971, 33.475816]
[-95.481102, 29.913294]
[-82.699, 28.334]
[-84.352035, 33.989889]
[-86.819468, 35.945115]
[-91.009638, 30.407864]
[-81.8428, 27.9096]

Categories

Resources