I am trying to scrape data on car model, price, mileage, location, etc. using BeautifulSoup. However, the returned result only reports data for one random car. I want to be able to collect data on all cars advertised on the site to date. My Python code is below. How can I modify my code to retrieve data such that each day I have information on car model, price, mileage, location, etc.? Example:
Car model price mileage location date
Toyota Corrola $4500 22km Accra 16/02/2018
Nissan Almera $9500 60km Tema 16/02/2018
etc
import requests
from bs4 import BeautifulSoup
import pandas
import csv
from datetime import datetime
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)

r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
print soup.prettify()

data = soup.find(class_='item-content')

for tag in data:
    item_title = data.find("a",attrs={"class":"item-title h4"})
    model = item_title.text.encode('utf-8').strip()

    item_meta = data.find("p",attrs={"class":"item-meta"})
    mileage = item_meta.text.encode('utf-8').strip()

    item_location = data.find("p",attrs={"class":"item-location"})
    location = item_location.text.encode('utf-8').strip()

    item_info = data.find("p",attrs={"class":"item-info"})
    price = item_info.text.encode('utf-8').strip()

    with open('example.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([model, price, mileage, location, datetime.now()])
First off, this loop:
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
is not doing what I assume you think it is. Because the format string has no {} placeholder, .format(i) returns the same url every time, so the loop simply reassigns an identical url 300 times; the rest of your code then runs once, after the loop, against that single url. You need to build a different url on each iteration and wrap all your scraping code inside this loop to ensure you are hitting each of the pages you want.
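You can see the placeholder problem on its own:

# Without a {} placeholder, .format() silently ignores its argument
print("https://tonaton.com/en/ads/ghana/cars?".format(7))
# prints: https://tonaton.com/en/ads/ghana/cars?

# With a placeholder, the page number actually ends up in the url
print("https://tonaton.com/en/ads/ghana/cars?{}".format(7))
# prints: https://tonaton.com/en/ads/ghana/cars?7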
Restructure your code (paying attention to indents!) so that the next url is the one being used in the request:
# This will print A LOT of titles
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
    print(url)  # Notice how the url changes with each iteration?
    r = requests.get(url)
    soup = bsoup(r.content, "html.parser")

    titles = soup.findAll("a", attrs={"class":"item-title h4"})
    for item in titles:
        currTitle = item.text.encode('utf-8').strip()
        print(currTitle)
This code:
import requests
from bs4 import BeautifulSoup as bsoup
url = "https://tonaton.com/en/ads/ghana/cars?1"
r = requests.get(url)
soup = bsoup(r.content, "html.parser")
titles = soup.findAll("a",attrs={"class":"item-title h4"})
for item in titles:
    print(item.text.encode('utf-8').strip())
Yields (the b'...' prefix appears because .encode('utf-8') turns the text into a bytes object):
b'Hyundai Veloster 2013'
b'Ford Edge 2009'
b'Mercedes-Benz C300 2016'
b'Mazda Demio 2007'
b'Hyundai Santa fe 2005'
# And so on...
The problem is that 1) find() stops at the first match for the given parameters, so you only ever capture one listing; findAll() instead returns every match in a list, which you can then iterate through and process as needed. And 2) because find() hands you back just that single element, the subsequent find() calls keep searching inside that one listing and return the same values over and over instead of moving on to the rest of the page.
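As a quick illustration of the difference, on a tiny made-up snippet (not the real page markup):

from bs4 import BeautifulSoup as bsoup

html = "<p class='item-info'>$4500</p><p class='item-info'>$9500</p>"
demo = bsoup(html, "html.parser")

# find() -> only the first matching tag
print(demo.find("p", attrs={"class": "item-info"}).text)
# findAll() -> a list with every matching tag
print([p.text for p in demo.findAll("p", attrs={"class": "item-info"})])

With findAll() in place, your full scraper becomes something like this: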
import requests
from bs4 import BeautifulSoup as bsoup
import csv
from datetime import datetime

for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
    r = requests.get(url)
    soup = bsoup(r.content, "html.parser")

    # findAll() returns every match on the page, one entry per listing
    item_title = soup.findAll("a", attrs={"class": "item-title h4"})
    item_meta = soup.findAll("p", attrs={"class": "item-meta"})
    item_location = soup.findAll("p", attrs={"class": "item-location"})
    item_info = soup.findAll("p", attrs={"class": "item-info"})

    # Open in append mode so each page adds rows instead of overwriting the file
    with open('index.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # zip keeps the fields that belong to the same listing together
        for title, meta, loc, info in zip(item_title, item_meta, item_location, item_info):
            model = title.text.strip()
            mileage = meta.text.strip()
            location = loc.text.strip()
            price = info.text.strip()
            writer.writerow([model, price, mileage, location, datetime.now()])
I am trying to scrape the prices from a website and it's working, but I can't write the result to a text file.
This is my Python code:
import requests
from bs4 import BeautifulSoup as bs
url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")
price = soup.find("div", {"class":"d-flex row col-md-9 px-0"})
name =("example")
f =open(name + '.txt', "a")
f.write(price.text)
This is not working, but if I print the result instead of trying to write it to a text file, it works. I have searched for a long time but don't understand it. I think it must be a string to be written to a text file, but I don't know how to convert the output to a string.
You're getting the error because of a Unicode character in the scraped text.
Try adding encoding='utf-8' when opening the file.
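Applied to your original code, the minimal change would be something like this (price here is the tag you already found with soup.find):

name = "example"
with open(name + '.txt', "a", encoding='utf-8') as f:
    f.write(price.text)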
Also, your code gives a bit of a messy output. Try this instead:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")

rows = soup.find("div", {"class":"d-flex row col-md-9 px-0"})
prices = rows.findAll("span", {"class":"price-holder-row"})
names = rows.findAll("div", {"class":"name-holder"})

price_list = []
name_list = []

for price in prices:
    price_list.append(price.text.strip("\n "))

for name in names:
    name_list.append(name.text.split()[0])

name = "example"
with open(f"{name}.txt", mode='w', encoding='utf-8') as f:
    for name, price in zip(name_list, price_list):
        f.write(f"{name}:{price}\n")
I am trying to extract the data from a drop-down menu using Python.
link:
From that page, these are the drop-downs:
State
Category
District
CTSO
Division
Map Type
I want to extract data from all of them by selecting each option one by one, using beautifulsoup or the requests library.
In my code, I gave the district names statically, but I want to select each one in turn and extract data from it.
I tried this, but it doesn't work:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

list = ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]
url = "http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName="

for lists in list:
    urls = url + lists
    # print(urls)
    response = requests.get(urls)
    # print(response)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(soup)
    # soups = soup.find_all("div", {"id": "level_text_2"})
    # print(soups)
    # for ids in soup.find_all(attrs={'id': 'location_table'}):
    #     print(ids)
    #     ids = ids.text.strip()
    #     print(ids)
    for option in soup.find_all('option'):
        print(option.text)
    for tag in soup.find_all(class_="panel-body"):
        # print(tag.get('ctl00_ContentPlaceHolder5_ddlDistrict'))
        print(tag)
I want:
District name
All taluka names
All village names, and so on.
Something like this:
import requests
from bs4 import BeautifulSoup

for dist_name in ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]:
    print('-- {} --'.format(dist_name))
    r = requests.get('http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName={}'.format(dist_name))
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html.parser")
        select_list = soup.find_all('select')
        for select in select_list:
            print('Select name: ' + select.attrs['name'])
            option_list = select.find_all('option')
            for option in option_list:
                print('\t option: ' + option.attrs['value'])
I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to work out a way of reliably identifying and scraping the relationship between the dataset title and the like count, as it is embedded in the HTML as below:
I have used a scraper previously to get information about the resource urls. In that case I was able to capture the last child a of the parent h3, whose parent has the class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the url scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
#debug = []

with requests.Session() as s:
    for page in range(1,2): #set number of pages
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')

        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))

    resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set

    for link in resources:
        try:
            r = s.get(link).json() #entire package info
            data_sets.append(r)
            title = r['result']['title'] #certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv','w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title','Resource Url'])
    for row in results:
        w.writerow(row)
My desired output would appear like this:
The approach is pretty straightforward. The website lists each dataset inside a <li> tag, so what you need to do is take the source of that <li> tag and fetch the heading, which has a certain class; the same goes for the like count.
The catch with the like count is that its text contains some noise. To fix that, you can use a regular expression to extract the digits ('\d+') from the likes text. The following code gives the desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd

source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text, 'lxml')

element = sp.find_all('li', {'class': "dataset-item"})

heading = []
likeList = []

for i in element:
    try:
        header = i.find('a', {'class': "searchpartnership-url-analytics"})
        heading.append(header.text)
    except:
        header = i.find('a')
        heading.append(header.text)

    like = i.find('span', {'id': 'likes-count'})
    likeList.append(re.findall(r'\d+', like.text)[0])

# Build the DataFrame from a plain dict of columns
data = {'Title': heading, 'Likes': likeList}
df = pd.DataFrame(data)
print(df)
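For example, the regex step on its own behaves like this (the sample string is made up, not taken from the site):

import re

likes_text = "\n        12 likes"
print(re.findall(r'\d+', likes_text)[0])   # '12'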
Hope it helped!
You could use the following.
I am using a CSS selector with Or syntax to retrieve the title and likes as one list (as every publication has both). I then use slicing to separate the titles from the likes.
from bs4 import BeautifulSoup as bs
import requests
import csv

def get_titles_and_likes(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    info = [item.text.strip() for item in soup.select(css_selector)]
    titles = info[::2]
    likes = info[1::2]
    return list(zip(titles, likes))

results = []

with requests.Session() as s:
    for page in range(1, 10):  # set number of pages
        data = get_titles_and_likes(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page),
                                    '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count')
        results.append(data)

results = [i for item in results for i in item]

with open(r'data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Likes'])
    for row in results:
        w.writerow(row)
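The slicing works because the combined selector is assumed to return each dataset's title immediately followed by its like count, so the flat list alternates between the two. A small illustration with made-up values:

info = ['Dataset A', '3', 'Dataset B', '0', 'Dataset C', '12']
titles = info[::2]     # ['Dataset A', 'Dataset B', 'Dataset C']
likes = info[1::2]     # ['3', '0', '12']
print(list(zip(titles, likes)))
# [('Dataset A', '3'), ('Dataset B', '0'), ('Dataset C', '12')]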
I am a beginner at web scraping and I am following this tutorial to extract movie data from this link; I chose to extract movies between 2016 and 2019 for the test. I get just 25 lines, but I want more than 30,000. Do you think that's possible?
This is the code:
from requests import get
from bs4 import BeautifulSoup
import csv
import pandas as pd
from time import sleep
from random import randint
from time import time
from IPython.core.display import clear_output

headers = {"Accept-Language": "en-US, en;q=0.5"}
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]

url = 'https://www.imdb.com/search/title?release_date=2016-01-01,2019-05-01'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

names = []
years = []
imdb_ratings = []
metascores = []
votes = []
start_time = time()
requests = 0

for year_url in years_url:
    # For every page in the interval 1-4
    for page in pages:
        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)
        # Pause the loop
        sleep(randint(8,15))
        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')

# Parse the content of the request with BeautifulSoup
page_html = BeautifulSoup(response.text, 'html.parser')

# Select all the 50 movie containers from a single page
mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))

movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes
                              })

#data cleansing
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()
movie_ratings['year'].unique()
movie_ratings.to_csv('movie_ratings.csv')
Start by double-checking your indentation throughout (in fact - naughty naughty - it is wrong in that tutorial; I am guessing it wasn't properly proofread after publishing, and the code has wrongly been left-aligned repeatedly).
To illustrate, you currently have something like this (reduced lines of code shown):
for year_url in years_url:
    for page in pages:
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + '&sort=num_votes,desc&page=' + page, headers = headers)

page_html = BeautifulSoup(response.text, 'html.parser')
Your indentation means that, if the code runs at all, you only ever parse the html of the last url you visited.
It should be:
for year_url in years_url:
    for page in pages:
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + '&sort=num_votes,desc&page=' + page, headers = headers)
        page_html = BeautifulSoup(response.text, 'html.parser')
Indentation gives meaning in Python.
https://docs.python.org/3/reference/lexical_analysis.html?highlight=indentation
Leading whitespace (spaces and tabs) at the beginning of a logical
line is used to compute the indentation level of the line, which in
turn is used to determine the grouping of statements.
It's hard to tell exactly what the issue is here because of the lack of functions, but from what I see you need to parse each page separately.
After every request, you need to parse the text. However, I suspect the main issue is the ordering of your code; I would suggest using functions, along the lines of the sketch below.
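A rough sketch of that restructuring (the function names are mine, the selectors are copied from your code, and I have not verified it against the current IMDb markup):

import pandas as pd
from requests import get
from bs4 import BeautifulSoup

headers = {"Accept-Language": "en-US, en;q=0.5"}

def fetch_page(year_url, page):
    # One request per year/page combination
    url = ('http://www.imdb.com/search/title?release_date=' + year_url
           + '&sort=num_votes,desc&page=' + page)
    return get(url, headers=headers)

def parse_page(html, rows):
    # Parse one page and collect one dict per movie that has a Metascore
    soup = BeautifulSoup(html, 'html.parser')
    for container in soup.find_all('div', class_='lister-item mode-advanced'):
        if container.find('div', class_='ratings-metascore') is not None:
            rows.append({
                'movie': container.h3.a.text,
                'year': container.h3.find('span', class_='lister-item-year').text,
                'imdb': float(container.strong.text),
                'metascore': int(container.find('span', class_='metascore').text),
                'votes': int(container.find('span', attrs={'name': 'nv'})['data-value']),
            })

rows = []
for year_url in [str(i) for i in range(2016, 2019)]:
    for page in [str(i) for i in range(1, 5)]:
        response = fetch_page(year_url, page)
        parse_page(response.text, rows)   # parse every page, not just the last one

movie_ratings = pd.DataFrame(rows)
print(movie_ratings.head())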
I have managed to build a very primitive program to scrape vehicle data from pistonheads and print it to a .csv file with the link, make and model. I am now working on getting the price, which is where I am encountering a problem.
I want to scrape the prices into the fourth column of my .csv file (Price) and to correctly print the price of each vehicle on the website.
At the moment I am only getting it to print the price from one vehicle, repeated again and again next to each vehicle in the .csv file.
I have tried soup.findAll and soup.find_all to see whether parsing through multiple elements would work but this is just creating a bigger mess.
Might someone be able to help?
I am also trying to scrape the image src and would like to print that on another column (5) called images.
import csv ; import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')

for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = ("https://www.pistonheads.com" + a['href'])
        make = (a['href'].split('/')[-4])
        model = (a['href'].split('/')[-3])
        price = soup.find('span')
        writer.writerow([link, make, model, price])
        print(link, make, model, price)

outfile.close()
You can try this:
import csv, requests, re
from urllib.parse import urlparse
from bs4 import BeautifulSoup as soup

d = soup(requests.get('https://www.pistonheads.com/classifieds?Category=used-cars&ResultsPerPage=100').text, 'html.parser')

def extract_details(_s:soup) -> list:
    _link = _s.find('a', {'href': re.compile(r'/classifieds/used-cars/')})['href']
    _, _, make, model, *_ = _link[1:].split('/')
    price, img = _s.find('div', {'class':'price'}).text, [i['src'] for i in _s.find_all('img')]
    return [_link, make, model, price, 'N/A' if not img else img[0]]

with open('filename.csv', 'w', newline='') as f:
    _listings = [extract_details(i) for i in d.find_all('div', {'class':'ad-listing'}) if i.find('div', {'class':'price'})]
    write = csv.writer(f)
    write.writerows([['link', 'make', 'model', 'price', 'img'], *_listings])
The reason is this line: price = soup.find('span')
.find() grabs the first element it finds, and you have it searching your whole soup object. Where you want it to look is within your a, because that is what you are looping through with for a in links:
I also added .text, as I assume you just want the text rather than the whole tag element, i.e. price = a.find('span').text
import csv ; import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price", "Images"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')

for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = ("https://www.pistonheads.com" + a['href'])
        make = (a['href'].split('/')[-4])
        model = (a['href'].split('/')[-3])
        price = a.find('span').text
        image_link = a.parent.parent.find('img')['src']
        image = link + image_link
        writer.writerow([link, make, model, price, image])
        print(link, make, model, price, image)

outfile.close()