I'd like to scrape Airbnb's listings by city (for the five cities listed in the code) and gather information such as the price, a link to the listing, the room type, the number of guests, etc. I was able to get the link, but I'm having trouble getting the price.
from bs4 import BeautifulSoup
import requests
import csv
from urllib.parse import urljoin # For joining next page url with base url
from datetime import datetime # For inserting the current date and time
start_url_nyc = "https://www.airbnb.com/s/New-York--NY--United-States"
start_url_mia = "https://www.airbnb.com/s/Miami--FL--United-States"
start_url_la = "https://www.airbnb.com/s/Los_Angeles--CA--United-States"
start_url_sf = "https://www.airbnb.com/s/San_Francisco--CA--United-States"
start_url_orl = "https://www.airbnb.com/s/Orlando--FL--United-States"
def scrape_airbnb(url):
    # Set up the URL request
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # Iterate over search results
    for search_result in soup.find_all('div', 'infoContainer_tfq3vd'):
        # Parse the name and price and record the time
        link_end = search_result.find('a').get('href')
        link = "https://www.airbnb.com" + link_end
        price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
    return (price)
print(scrape_airbnb(start_url_orl))
This is the relevant HTML:
<span data-pricerate="true" data-reactid=".91165im9kw.0.2.0.3.2.1.0.$0.$grid_0.$0/=1$=01$16085565.$=1$16085565.0.2.0.1.0.0.0.1:1">552</span>
This is your code:
price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
First, from the BeautifulSoup documentation:
Some attributes, like the data-* attributes in HTML 5, have names that can’t be used as the names of keyword arguments:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
You can use these attributes in searches by putting them into a
dictionary and passing the dictionary into find_all() as the attrs
argument:
data_soup.find_all(attrs={"data-foo": "value"})
# [<div data-foo="value">foo!</div>]
So use:
price = search_result.find('span', attrs={"data-pricerate": "true"})
This will return a span tag that contains the price as a string; just use .text:
price = search_result.find('span', attrs={"data-pricerate": "true"}).text
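Putting the two fixes together, here is a minimal sketch of the corrected function (the infoContainer_tfq3vd class name comes from the question and may well have changed on Airbnb's side since):

from bs4 import BeautifulSoup
import requests

def scrape_airbnb(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    listings = []
    for search_result in soup.find_all('div', 'infoContainer_tfq3vd'):
        link = "https://www.airbnb.com" + search_result.find('a').get('href')
        # attrs= is required because data-pricerate is not a valid keyword argument
        price_tag = search_result.find('span', attrs={"data-pricerate": "true"})
        price = price_tag.text if price_tag else None
        listings.append((link, price))
    return listings  # return after the loop so every result is collected, not just the first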
I have the following code that goes to an eBay URL and extracts the name of a listing and its sale price; see below:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
html = urlopen('https://www.ebay.com/sch/i.html?_from=R40&_nkw=manga&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1')
soup = BeautifulSoup(html.read(), 'html.parser')
soldItem = soup.find('h3', class_='s-item__title s-item__title--has-tags')
salePrice = soup.find('span', class_='POSITIVE')
#data = soup.find('div', class_='s-item__info clearfix')
itemData = {soldItem.get_text():salePrice.get_text()}
I want to create a for loop that iterates over the first page and gives me the name and sale price of every listing.
However, every single attempt that I've made returns either the same listing five times or all sold items and thereafter all sale prices.
Any hints as to how to format my for loop?
You can try this code, which collects every item and its price into a dictionary keyed by item name, with the price as the value.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
html = urlopen('https://www.ebay.com/sch/i.html?_from=R40&_nkw=manga&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1')
soup = BeautifulSoup(html.read(), 'html.parser')
soldItem = soup.find_all('h3', class_='s-item__title s-item__title--has-tags')
salePrice = soup.find_all('span', class_='POSITIVE')
itemsData = {item.text: price.text for item, price in zip(soldItem, salePrice)}
print(itemsData)
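One caveat: zip pairs the two lists positionally, so if a single listing is missing a price, every pairing after it silently shifts. A more defensive sketch, continuing from the soup above, iterates per result container instead (the s-item__info class is an assumption based on eBay's markup at the time):

items_data = {}
for item in soup.find_all('div', class_='s-item__info'):
    title = item.find('h3', class_='s-item__title')
    price = item.find('span', class_='POSITIVE')
    if title and price:  # skip rows where either field is missing
        items_data[title.get_text()] = price.get_text()
print(items_data)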
I want to scrape some specific columns (the Company Details column) on the CNBC Nasdaq 100 website, specifically for the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
    description = page_content.find_all('div',{"class":"moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr"))-1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
            shares = items.find_all("td").find("table",attrs={"id":"shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr"))-1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
            # Print ("")
            l.append(key)
            key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df
get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
I'm keen to get the result in a dataframe so that I can convert it to a CSV file, but my code keeps returning an empty dataframe.
The information you are looking for is not available at the URL you requested. That is because the page fetches it with JavaScript, which in turn requests a different URL that provides the data.
Example code:
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
From here you can create a dataframe and export it to CSV.
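For instance, a minimal sketch of that last step, reusing the variables extracted above (the output filename is arbitrary):

import pandas as pd

# Collect the scraped fields into a single-row dataframe and export it
df = pd.DataFrame([{
    "name": Name,
    "stock_desc": stock_desc,
    "stock_outstanding_shares": stock_outstanding_shares,
    "stock_ownership": stock_ownership,
    "stock_market_cap": stock_market_cap,
    "stock_lastSplit": stock_lastSplit,
}])
df.to_csv("ADBE_profile.csv", index=False)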
I am trying to extract the data from dropdowns using Python. The page (linked in the code below) has these dropdowns:
State
Category
District
CTSO
Division
Map Type
I want to extract data from all of them, selecting the options one by one, using BeautifulSoup or the requests library. In my code I set the district name statically, but I want to select each district in turn and extract its data. I tried the following, but it doesn't work:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

list = ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]
url = "http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName="
for lists in list:
    urls = url + lists
    # print(urls)
    response = requests.get(urls)
    # print(response)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(soup)
    # soups = soup.find_all("div", {"id": "level_text_2"})
    # print(soups)
    # for ids in soup.find_all(attrs={'id': 'location_table'}):
    #     print(ids)
    #     ids = ids.text.strip()
    #     print(ids)
    for option in soup.find_all('option'):
        print(option.text)
    for tag in soup.find_all(class_="panel-body"):
        # print(tag.get('ctl00_ContentPlaceHolder5_ddlDistrict'))
        print(tag)
I want the district name, all taluka names, and all village names.
Something like this:
import requests
from bs4 import BeautifulSoup

for dist_name in ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]:
    print('-- {} --'.format(dist_name))
    r = requests.get('http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName={}'.format(dist_name))
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html.parser")
        select_list = soup.find_all('select')
        for select in select_list:
            print('Select name: ' + select.attrs['name'])
            option_list = select.find_all('option')
            for option in option_list:
                print('\t option: ' + option.attrs['value'])
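If you also want the human-readable labels (the taluka and village names) rather than just the value attributes, each option's text carries them; a small extension of the inner loop:

for option in option_list:
    # value attribute plus the visible label, e.g. a taluka or village name
    print('\t option: {} ({})'.format(option.attrs.get('value', ''), option.text.strip()))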
I've written a script in Python to scrape the tabular content from a webpage. In the first column of the main table are the names. Some names have links leading to another page; some are just names without any link. My intention is to parse the rows when a name has no link to another page. However, when a name does link to another page, the script should first parse the relevant rows from the main table and then follow that link to parse the associated information for that name from the table located at the bottom, under the title Companies. Finally, it should write them to a CSV file. (The site link is in the code below.)
This is what I've tried so far:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"

res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select("table tr")[1:]:
    if not item.select_one("td a[href]"):
        first_table = [i.text for i in item.select("td")]
        print(first_table)
    else:
        first_table = [i.text for i in item.select("td")]
        print(first_table)
        url = urljoin(base, item.select_one("td a[href]").get("href"))
        resp = requests.get(url)
        soup_ano = BeautifulSoup(resp.text, "lxml")
        for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
            associated_info = [elem.text for elem in elems.select("td")]
            print(associated_info)
My script above can do almost everything, but I can't come up with the logic to print once, rather than three times, so that all the data is together and I can write it to a CSV file.
Put all your scraped data into a list; here I've called the list associated_info. Then all the data is in one place, and you can iterate over the list to write it out to a CSV if you like.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"

res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

associated_info = []
for item in soup.select("table tr")[1:]:
    if not item.select_one("td a[href]"):
        associated_info.append([i.text for i in item.select("td")])
    else:
        associated_info.append([i.text for i in item.select("td")])
        url = urljoin(base, item.select_one("td a[href]").get("href"))
        resp = requests.get(url)
        soup_ano = BeautifulSoup(resp.text, "lxml")
        for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
            associated_info.append([elem.text for elem in elems.select("td")])

print(associated_info)
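From there, writing the list out is a single csv.writer call; a minimal sketch (the filename is arbitrary):

import csv

with open("endole_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(associated_info)  # one CSV row per scraped row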
I'm trying to get and print the current weather temperature and city name from a local website, but with no success.
All I need is to read and print the city (Londrina), the temperature (23.1°C), and if possible the title in ca-cond-firs ("Temperatura em declínio", i.e. "temperature falling"); this last one changes as the temperature goes up or down.
This is the relevant HTML section of the site:
<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
This is the code I have so far:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
id = soup.find('a', 'id=23185109')
print(id)
Any help?
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser') # parse page as html
temp_table = soup.find_all('table', {'class':'cidadeTempo'}) # get detail of table with class name cidadeTempo
for entity in temp_table:
    city_name = entity.find('h3').text  # fetches the city name
    city_temp_max = entity.find('span', {'class':'tempMax'}).text  # fetches the max temperature
    city_temp_min = entity.find('span', {'class':'tempMin'}).text  # fetches the min temperature
    print("City: {} \t Max_temp: {} \t Min_temp: {}".format(city_name, city_temp_max, city_temp_min))
The code below gets the temperature details shown at the right side of the page, as you require:
result_table = soup.find('div', {'class':'ca-content-wrapper'})
# No other div on this page uses the class ca-content-wrapper, so it can be
# used directly without iterating; add a condition if you only want certain
# cities.
print(result_table.text)
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm
I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
rows = soup.findAll('li', {'class': 'ca-cond-firs'})  # attrs must be a dict, not a set
print(rows)
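Each element of rows is a Tag, so .text pulls out the temperature string; for example:

for row in rows:
    print(row.text.strip())  # e.g. '23.1°C'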
You should try out the CSS3 selectors in BS4; I personally find them a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that matches the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status inside each li.ca-cond-firs > img title attribute
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# len(cities) == len(temps) == len(tempStatus) => This is normally true.
for i in range(len(cities)):
    print("City: {}, Temperature: {}, Status: {}.".format(cities[i], temps[i], tempStatus[i]))
Here you go. You can customize the wind-direction handling depending on the icon name.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

def get_weather_data():
    URL = 'http://www.simepar.br/site/index.shtml'
    rawhtml = requests.get(URL).text
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cities = soup.find('div', {"class": "ca-content-wrapper"})
    weather_data = []
    for city in cities.findAll("div", {"class": "ca-bg"}):
        name = city.find("div", {"class": "ca-cidade"}).text
        temp = city.find("li", {"class": "ca-cond-firs"}).text
        conditions = city.findAll("li", {"class": "ca-cond"})
        weather_data.append({
            "city": name,
            "temp": temp,
            "conditions": [{
                "wind": conditions[0].text + " " + what_wind(conditions[0].find("img")["src"]),
                "humidity": conditions[1].text,
                "rain": conditions[2].text,
            }]
        })
    return weather_data

def what_wind(img):
    # Compare against the icon file name (e.g. ".../vento/NE.png" -> "NE");
    # a bare substring test would also match letters elsewhere in the path.
    direction = img.split("/")[-1].split(".")[0]
    if direction == "NE":
        return "From North East"
    if direction == "O":  # "Oeste", Portuguese for west
        return "From West"
    if direction == "N":
        return "From North"
    # you can add other icons here
    return ""

print(get_weather_data())
And that is all weather data from that website.