I've got the following code:
from bs4 import BeautifulSoup
import requests
import re
# Source Sites
mimo = 'https://tienda.mimo.com.ar/mimo/junior/ropa-para-ninas.html'
cheeky = ''
grisino = ''
source = requests.get(mimo).text
soup = BeautifulSoup(source, 'lxml')
for name_product, old_price, special_price in zip(soup.select('h3.titprod'),
                                                  soup.select('span[id^="old-price"]'),
                                                  soup.select('span[id^="product-price"]')):
    print(f'Name: {name_product.text.strip()} | Old price = {old_price.text.strip()} | Discounted price = {special_price.text.strip()}')
It outputs the information perfectly:
Name: TAPABOCAS | Old price = $ 295 | Discounted price = $ 236
Name: REMERA JR TOWN | Old price = $ 990 | Discounted price = $ 743
Name: CAMISOLA NENA DELFI | Old price = $ 2.300 | Discounted price = $ 1.725
Name: CAMISOLA JR TRAFUL | Old price = $ 1.550 | Discounted price = $ 1.163
Name: VESTIDO NENA DELFI | Old price = $ 2.990 | Discounted price = $ 2.243
Name: SAQUITO JR DESAGUJADO | Old price = $ 1.990 | Discounted price = $ 1.493
Name: JEGGING JR ENGOMADO | Old price = $ 1.990 | Discounted price = $ 1.493
But sometimes the special_price loop won't find a discounted price, so I need a try/except. I tried to "preprocess" it, but I can't get it to work:
special_prices_with_defaults_added = []
for sp in soup.select('span[id^="product-price"]'):
    try:
        special_prices_with_defaults_added.append(sp.text.strip())
    except:
        special_prices_with_defaults_added.append("No default price available")
for name_product, old_price, special_price in zip(
        soup.select('h3.titprod'), soup.select('span[id^="old-price"]'), special_prices_with_defaults_added):
    print(f'Name: {name_product.text.strip()} | Old price = {old_price.text.strip()} | Discounted price = {special_prices_with_defaults_added}')
Wrong output:
Name: TAPABOCAS | Old price = $ 295 | Discounted price = ['$\xa0236', '$\xa0743', '$\xa01.725', '$\xa01.163', '$\xa02.243', '$\xa01.493', '$\xa01.493', '$\xa02.925', '$\xa0668', '$\xa0713', '$\xa01.688', '$\xa01.268', '$\xa0593', '$\xa0743', '$\xa01.125', '$\xa03.300', '$\xa02.175', '$\xa0743', '$\xa01.493', '$\xa0863', '$\xa0668', '$\xa0792', '$\xa01.520', '$\xa01.760', '$\xa0696', '$\xa03.150', '$\xa03.520', '$\xa0712', '$\xa01.352', '$\xa01.112', '$\xa01.112', '$\xa01.192', '$\xa02.800', '$\xa02.720', '$\xa03.920', '$\xa01.920']
Name: REMERA JR TOWN | Old price = $ 990 | Discounted price = ['$\xa0236', '$\xa0743', '$\xa01.725', '$\xa01.163', '$\xa02.243', '$\xa01.493', '$\xa01.493', '$\xa02.925', '$\xa0668', '$\xa0713', '$\xa01.688', '$\xa01.268', '$\xa0593', '$\xa0743', '$\xa01.125', '$\xa03.300', '$\xa02.175', '$\xa0743', '$\xa01.493', '$\xa0863', '$\xa0668', '$\xa0792', '$\xa01.520', '$\xa01.760', '$\xa0696', '$\xa03.150', '$\xa03.520', '$\xa0712', '$\xa01.352', '$\xa01.112', '$\xa01.112', '$\xa01.192', '$\xa02.800', '$\xa02.720', '$\xa03.920', '$\xa01.920']
As @furas said, it was just a small fix inside the loop: print the special_price loop variable instead of the whole list.
for name_product, old_price, special_price in zip(
        soup.select('h3.titprod'), soup.select('span[id^="old-price"]'), special_prices_with_defaults_added):
    print(
        f'Name: {name_product.text.strip()} | Old price = {old_price.text.strip()} | Discounted price = {special_price}')
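If the three select() lists can get out of step because a product lacks one of the spans, a safer route is to select each product card once and look the prices up inside it, with a default for missing ones. A minimal sketch, assuming the cards are matched by a selector like li.item (hypothetical; check the page's real markup):

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get(mimo).text, 'lxml')  # mimo as defined in the question
for card in soup.select('li.item'):  # hypothetical product-card selector
    name = card.select_one('h3.titprod')
    old_price = card.select_one('span[id^="old-price"]')
    special_price = card.select_one('span[id^="product-price"]')
    if name:  # skip nodes that are not product cards
        print(f'Name: {name.text.strip()} | '
              f'Old price = {old_price.text.strip() if old_price else "n/a"} | '
              f'Discounted price = {special_price.text.strip() if special_price else "No discounted price"}')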
Related
I'm trying to get data from the pyOWM package using city names, but in some cases a city-name typo means no data is found and the process breaks.
I want to get the weather data using lat/long instead, but I don't know how to set up a function for it.
Df1:

User | City         | State               | Zip    | Lat        | Long
-----|--------------|---------------------|--------|------------|------------
A    | Kuala Lumpur | Wilayah Persekutuan | 50100  | 5.3288907  | 103.1344397
B    | Dublin       | County Dublin       | NA     | 50.2030506 | 14.5509842
C    | Oconomowoc   | NA                  | NA     | 53.3640384 | -6.1953066
D    | Mumbai       | Maharashtra         | 400067 | 19.2177166 | 72.9708833
E    | Mratin       | Stredocesky kraj    | 250 63 | 40.7560585 | -5.6924778
...
Code:
--------
import time
from tqdm.notebook import tqdm
import pandas as pd
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps

cities = Df1["City"].unique().tolist()
cities1 = cities[:5]
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()

'''
Step-1 Define the lists where the data is saved
'''
list_wind_Speed = []
list_tempreture = []
list_max_temp = []
list_min_temp = []
list_humidity = []
list_pressure = []
list_city = []
list_cloud = []
list_status = []
list_rain = []

'''
Step-2 Fetch data
'''
j = 0
for city in tqdm(cities1):
    j += 1
    if j < 60:
        # one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
        # one_call_obs.current.humidity
        observation = mgr.weather_at_place(city)
        l = observation.weather
        list_city.append(city)
        list_wind_Speed.append(l.wind()['speed'])
        list_tempreture.append(l.temperature('celsius')['temp'])
        list_max_temp.append(l.temperature('celsius')['temp_max'])
        list_min_temp.append(l.temperature('celsius')['temp_min'])
        list_humidity.append(l.humidity)
        list_pressure.append(l.pressure['press'])
        list_cloud.append(l.clouds)
        list_status.append(l.detailed_status)
        list_rain.append(l.rain)
    else:
        time.sleep(60)
        j = 0

'''
Step-3 Blank data frame and store the data in it
'''
df2 = pd.DataFrame()
df2["City"] = list_city
df2["Temp"] = list_tempreture
df2["Max_Temp"] = list_max_temp
df2["Min_Temp"] = list_min_temp
df2["Cloud"] = list_cloud
df2["Humidity"] = list_humidity
df2["Pressure"] = list_pressure
df2["Status"] = list_status
df2["Rain"] = list_rain
df2
From the above code, I get the result below:

City         | Temp  | Max_Temp | Min_Temp | Cloud | Humidity | Pressure | Status        | Rain
-------------|-------|----------|----------|-------|----------|----------|---------------|--------------
Kuala Lumpur | 29.22 | 30.00    | 27.78    | 20    | 70       | 1007     | moderate rain | moderate rain
Dublin       | 23.12 | 26.43    | 22.34    | 15    | 89       | 978      | cloudy        | cloudy
...
Now, because of some city-name typos, the process stops.
I'm looking for an alternative: get the weather data from the Lat/Long columns instead, but I don't know how to set up a function that takes the lat & long column data.
Df1 = {'User': ['A', 'B', 'C', 'D', 'E'],
       'City': ['Kuala Lumpur', 'Dublin', 'Oconomowoc', 'Mumbai', 'Mratin'],
       'State': ['Wilayah Persekutuan', 'County Dublin', None, 'Maharashtra', 'Stredocesky kraj'],
       'Zip': ['50100', None, None, '400067', '250 63'],
       'Lat': [5.3288907, 50.2030506, 53.3640384, 19.2177166, 40.7560585],
       'Long': [103.1344397, 14.5509842, -6.1953066, 72.9708833, -5.6924778]}
# Try to use this code to get weather data
# one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
# one_call_obs.current.humidity
Expected Result
--------------
User | City | Lat | Long | Temp | Cloud | Humidity | Pressure | Rain | Status
-----------------------------------------------------------------------------
Catch the error if a city is not found, parse the lat/lon from the dataframe. Use that lat/lon to create a bounding box and use weather_at_places_in_bbox to get a list of observations in that area.
import time
from tqdm.notebook import tqdm
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
import pandas as pd
from pyowm.commons.exceptions import NotFoundError, ParseAPIResponseError

# 'C airo' is a deliberate typo so the lat/lon fallback gets exercised
df1 = pd.DataFrame({'City': ('Kuala Lumpur', 'Dublin', 'Oconomowoc', 'Mumbai', 'C airo', 'Mratin'),
                    'Lat': ('5.3288907', '50.2030506', '53.3640384', '19.2177166', '30.22', '40.7560585'),
                    'Long': ('103.1344397', '14.5509842', '-6.1953066', '72.9708833', '31', '-5.6924778')})

cities = df1["City"].unique().tolist()
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()

for city in cities:
    try:
        observation = mgr.weather_at_place(city)
        # print(city, observation)
    except NotFoundError:
        # get city by lat/lon
        lat_top = float(df1.loc[df1['City'] == city, 'Lat'])
        lon_left = float(df1.loc[df1['City'] == city, 'Long'])
        lat_bottom = lat_top - 0.3
        lon_right = lon_left + 0.3
        try:
            observations = mgr.weather_at_places_in_bbox(lon_left, lat_bottom, lon_right, lat_top, zoom=5)
            observation = observations[0]
        except ParseAPIResponseError:
            raise RuntimeError(f"Couldn't find {city} at lat: {lat_top} / lon: {lon_left}, try tweaking the bounding box")
    weather = observation.weather
    temp = weather.temperature('celsius')['temp']
    print(f"The current temperature in {city} is {temp}")
I am trying to scrape Craigslist using BeautifulSoup4. All data shows properly EXCEPT price. I can't seem to find the right tagging to loop through pricing instead of showing the same price for each post.
import requests
from bs4 import BeautifulSoup
source = requests.get('https://washingtondc.craigslist.org/search/nva/sss?query=5%20hp%20boat%20motor&sort=rel').text
soup = BeautifulSoup(source, 'lxml')
for summary in soup.find_all('p', class_='result-info'):
    pricing = soup.find('span', class_='result-price')
    price = pricing
    title = summary.a.text
    url = summary.a['href']
    print(title + '\n' + price.text + '\n' + url + '\n')
I want the pricing to not repeat the same number. (Screenshots omitted: the Craigslist HTML on the left, with irrelevant code commented out, the code in Sublime on the right, and a terminal run where the price is the same for each post.)
Thank you
Your script is almost correct. You need to search within summary instead of soup for the price:
import requests
from bs4 import BeautifulSoup
source = requests.get('https://washingtondc.craigslist.org/search/nva/sss?query=5%20hp%20boat%20motor&sort=rel').text
soup = BeautifulSoup(source, 'lxml')
for summary in soup.find_all('p', class_='result-info'):
    price = summary.find('span', class_='result-price')
    title = summary.a.text
    url = summary.a['href']
    print(title + '\n' + price.text + '\n' + url + '\n')
Output:
Boat Water Tender - 10 Tri-Hull with Electric Trolling Motor
$629
https://washingtondc.craigslist.org/nva/boa/d/haymarket-boat-water-tender-10-tri-hull/7160572264.html
1987 Boston Whaler Montauk 17
$25450
https://washingtondc.craigslist.org/nva/boa/d/alexandria-1987-boston-whaler-montauk-17/7163033134.html
1971 Westerly Warwick Sailboat
$3900
https://washingtondc.craigslist.org/mld/boa/d/upper-marlboro-1971-westerly-warwick/7170495800.html
Buy or Rent. DC Party Pontoon for Dock Parties or Cruises
$15000
https://washingtondc.craigslist.org/doc/boa/d/washington-buy-or-rent-dc-party-pontoon/7157810378.html
West Marine Zodiac Inflatable Boat SB285 With 5HP Gamefisher (Merc)
$850
https://annapolis.craigslist.org/boa/d/annapolis-west-marine-zodiac-inflatable/7166031908.html
2012 AB aluminum/hypalon inflatable dinghy/2012 Yamaha 6hp four stroke
$3400
https://annapolis.craigslist.org/bpo/d/annapolis-2012-ab-aluminum-hypalon/7157768911.html
RHODES-18’ CENTERBOARD DAYSAILER
$6500
https://annapolis.craigslist.org/boa/d/ocean-view-rhodes-18-centerboard/7148322078.html
Mercury Outboard 7.5 HP
$250
https://baltimore.craigslist.org/bpo/d/middle-river-mercury-outboard-75-hp/7167399866.html
8 hp yamaha 2 stroke
$0
https://baltimore.craigslist.org/bpo/d/8-hp-yamaha-2-stroke/7154103281.html
TRADE 38' BENETEAU IDYLLE 1150
$35000
https://baltimore.craigslist.org/boa/d/middle-river-trade-38-beneteau-idylle/7163761741.html
5-hp Top Tank Mercury
$0
https://baltimore.craigslist.org/bpo/d/5-hp-top-tank-mercury/7154102434.html
5-hp Top Tank Mercury
$0
https://baltimore.craigslist.org/bpo/d/5-hp-top-tank-mercury/7154102744.html
Wanted ur unwanted outboards
$0
https://baltimore.craigslist.org/bpo/d/randallstown-wanted-ur-unwanted/7141349142.html
Grumman Sport Boat
$2250
https://baltimore.craigslist.org/boa/d/baldwin-grumman-sport-boat/7157186381.html
1996 Carver 355 Aft Cabin Motor Yacht
$47000
https://baltimore.craigslist.org/boa/d/middle-river-1996-carver-355-aft-cabin/7156830617.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155566763.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155565771.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155566035.html
Lower unit, long shaft
$50
https://baltimore.craigslist.org/bpo/d/catonsville-lower-unit-long-shaft/7155565301.html
Cape Dory 25 Sailboat for sale or trade
$6500
https://baltimore.craigslist.org/boa/d/reedville-cape-dory-25-sailboat-for/7149227778.html
West Marine HP-V 350
$1200
https://baltimore.craigslist.org/boa/d/pasadena-west-marine-hp-350/7147285666.html
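One caveat with the fixed loop: if a post has no result-price span, summary.find returns None and price.text raises an AttributeError. A small guard (with a hypothetical placeholder string) keeps the loop going:

for summary in soup.find_all('p', class_='result-info'):
    price = summary.find('span', class_='result-price')  # may be None for unpriced posts
    title = summary.a.text
    url = summary.a['href']
    print(title + '\n' + (price.text if price else 'no price listed') + '\n' + url + '\n')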
I have two infoboxes that look exactly the same to me, but I'm getting different behavior in mwparserfromhell. In the first instance I'm getting what I expect - the entire infobox is captured as a template object. In the second instance parts of the infobox are extracted as separate templates. This is confusing since the infoboxes look very similar to me, and I was expecting that the entire infobox could be extracted in the second case.
This is the code I'm using:
mwparserfromhell.parse(text.strip().lower()).filter_templates()
Text 1 Input:
txt1 = """{{Infobox building
| name = 666 Fifth Avenue
| former_names = Tishman Building
| status = Complete
| image = 666 Fifth Avenue by David Shankbone.jpg
| image_size = 300px
| caption =
| location = 666 Fifth Avenue<br>[[Manhattan]], [[New York (state)|New York]] 10103
| coordinates = {{coord|40.760163|-73.976204|format=dms}}
| start_date =
| completion_date = 1957
| architect = [[Carson & Lundin]]
| owner = [[Brookfield Properties]]
| cost = $40 million
| floor_area = {{convert|1,463,892|sqft|m2|abbr=on}}
| top_floor =
| floor_count = 41
| references =
| map_type =
| building_type = Office
| antenna_spire =
| roof = {{convert|483|ft|m|abbr=on}}
| elevator_count = 24 (20 passenger, 4 freight)
| structural_engineer =
| main_contractor =
| opening = November 25, 1957
| developer = Tishman Realty and Construction
| management =
}}"""
Text 1 Output:
['{{infobox building\n| name = 666 fifth avenue\n| former_names = tishman building\n| status = complete\n| image = 666 fifth avenue by david shankbone.jpg\n| image_size = 300px\n| caption = \n| location = 666 fifth avenue<br>[[manhattan]], [[new york (state)|new york]] 10103\n| coordinates = {{coord|40.760163|-73.976204|format=dms}}\n| start_date = \n| completion_date = 1957\n| architect = [[carson & lundin]]\n| owner = [[brookfield properties]]\n| cost = $40 million\n| floor_area = {{convert|1,463,892|sqft|m2|abbr=on}}\n| top_floor = \n| floor_count = 41\n| references = \n| map_type = \n| building_type = office\n| antenna_spire = \n| roof = {{convert|483|ft|m|abbr=on}}\n| elevator_count = 24 (20 passenger, 4 freight)\n| structural_engineer = \n| main_contractor = \n| opening = november 25, 1957\n| developer = tishman realty and construction\n| management = \n}}',
'{{coord|40.760163|-73.976204|format=dms}}',
'{{convert|1,463,892|sqft|m2|abbr=on}}',
'{{convert|483|ft|m|abbr=on}}']
Text 2 Input:
txt2 = """{{Infobox building
| name = Central Park Tower
| alternate_names = Nordstrom Tower
| image = Central Park Tower April 2020.jpg
| caption = Central Park Tower on April 25, 2020
| location = 225 [[57th Street (Manhattan)|West 57th Street]]<br/>[[Manhattan]], [[New York City]], [[New York (state)|New York]], [[United States|U.S.]]
| coordinates = {{coord|40.7663|-73.9810|type:landmark_globe:earth_region:US-NY|display=inline,title}}
| status = Topped Out
| start_date = 2014
| est_completion = 2020<ref name=curbed>{{cite news |author=Amy Plitt |url=https://ny.curbed.com/2017/6/1/15714666/central-park-tower-offering-plan-approval-sales-launch |title=Central Park Tower is now one step closer to launching sales |date=June 1, 2017 |access-date=August 30, 2017 |work=Curbed}}</ref>
| building_type = [[Residential]], [[retail]]
| architectural_style = [[Modern architecture|Modern]]
| architectural = {{cvt|1550|ft|0}}
| floor_count = 131<ref>{{cite web |url=https://www.architecturaldigest.com/story/new-york-city-central-park-tower-worlds-tallest-residential-building </ref><ref>{{cite web |url=https://archpaper.com/2019/09/central-park-tower-tops-out/</ref> (98 habitable floors)<ref name="auto">{{Cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=Central Park Tower - The Skyscraper Center |website=www.skyscrapercenter.com |access-date=October 10, 2018}}</ref>
| elevator_count = 11
| cost = $3 billion<ref name="Tase">{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=All in Good TASE: The Crisis for the American Cohort in Tel Aviv Is Essentially Over|date=April 4, 2019|work=Commercial Observer|last=Gourarie|first=Chava}}</ref>
| floor_area = {{convert|1,285,308|sqft|m2}}<ref name="auto" />
| architect = [[Adrian Smith + Gordon Gill Architecture]]
| structural_engineer = [[WSP Global]]
| main_contractor = [[Lendlease]]
| developer = [[Extell Development Company]]
}}"""
Text 2 Output:
['{{coord|40.7663|-73.9810|type:landmark_globe:earth_region:us-ny|display=inline,title}}',
'{{cite news |author=amy plitt |url=https://ny.curbed.com/2017/6/1/15714666/central-park-tower-offering-plan-approval-sales-launch |title=central park tower is now one step closer to launching sales |date=june 1, 2017 |access-date=august 30, 2017 |work=curbed}}',
'{{cvt|1550|ft|0}}',
'{{cite web |url=https://archpaper.com/2019/09/central-park-tower-tops-out/</ref> (98 habitable floors)<ref name="auto">{{cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=central park tower - the skyscraper center |website=www.skyscrapercenter.com |access-date=october 10, 2018}}</ref>\n| elevator_count = 11\n| cost = $3 billion<ref name="tase">{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=all in good tase: the crisis for the american cohort in tel aviv is essentially over|date=april 4, 2019|work=commercial observer|last=gourarie|first=chava}}</ref>\n| floor_area = {{convert|1,285,308|sqft|m2}}<ref name="auto" />\n| architect = [[adrian smith + gordon gill architecture]]\n| structural_engineer = [[wsp global]]\n| main_contractor = [[lendlease]]\n| developer = [[extell development company]]\n}}',
'{{cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=central park tower - the skyscraper center |website=www.skyscrapercenter.com |access-date=october 10, 2018}}',
'{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=all in good tase: the crisis for the american cohort in tel aviv is essentially over|date=april 4, 2019|work=commercial observer|last=gourarie|first=chava}}',
'{{convert|1,285,308|sqft|m2}}']
This is a known bug in mwparserfromhell. My workaround was to create an on-the-fly regex pattern that removes the ref link but keeps the rest of the text intact, turning
{{ this is text <ref>this is a ref link}}</ref>
to
{{ this is text }}
Here's the code:
import re
import mwparserfromhell

def get_regex_str_pattern(reg_str):
    """ Creates a regex pattern to remove specific substrings that are embedded in larger strings
    :param reg_str: String to tokenize
    :return: Regex pattern
    """
    return r'.*'.join([f"({re.escape(r.strip())})" for r in reg_str.split()])

def get_ref_clean_str(txt):
    """ Removes badly formed ref strings from wiki text
    :param txt: Wiki text
    :return: Parser object
    """
    clean_txt = txt
    wiki_text = mwparserfromhell.parse(txt)
    for r in wiki_text.filter_tags():
        if str(r.tag) in ('ref', 'br'):
            # str(r) because filter_tags() yields Tag nodes, not strings
            clean_txt = re.sub(get_regex_str_pattern(str(r)), ' ', clean_txt)
    return mwparserfromhell.parse(clean_txt)
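With that cleanup, the problem text can be preprocessed before filtering; in principle the second infobox should then come back as a single template. A usage sketch, reusing txt2 from the question:

cleaned = get_ref_clean_str(txt2.strip().lower())
for template in cleaned.filter_templates():
    print(template.name.strip())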
I'm trying to scrape Autotrader's website to get an Excel file of the stats and names.
I'm stuck trying to loop through an HTML 'ul' element without any classes or IDs, and organize that info in a Python list so I can append the individual li elements to different fields in my table.
As you can see, I'm able to target the title and price elements, but the 'ul' is really tricky... well... for someone at my skill level.
The specific code I'm struggling with:
for i in range(1, 2):
    response = get('https://www.autotrader.co.uk/car-search?sort=sponsored&seller-type=private&page=' + str(i))
    html_soup = BeautifulSoup(response.text, 'html.parser')
    ad_containers = html_soup.find_all('h2', class_='listing-title title-wrap')
    price_containers = html_soup.find_all('section', class_='price-column')
    for container in ad_containers:
        name = container.find('a', class_="js-click-handler listing-fpa-link").text
        names.append(name)
        # Trying to loop through the key specs list and assign each 'li' to a different field in the table
        lis = []
        list_container = container.find('ul', class_='listing-key-specs')
        for li in list_container.find('li'):
            lis.append(li)
        year.append(lis[0])
        body_type.append(lis[1])
        milage.append(lis[2])
        engine.append(lis[3])
        hp.append(lis[4])
        transmission.append(lis[5])
        petrol_type.append(lis[6])
        lis = []  # Clearing the list to get ready for the next set of data
The run fails with an error message (posted as a screenshot, omitted here).
Full code here:
from requests import get
from bs4 import BeautifulSoup
import pandas
# from time import sleep, time
# import random
# Create table fields
names = []
prices = []
year = []
body_type = []
milage = []
engine = []
hp = []
transmission = []
petrol_type = []
for i in range(1, 2):
    # Make a get request
    response = get('https://www.autotrader.co.uk/car-search?sort=sponsored&seller-type=private&page=' + str(i))
    # Pause the loop
    # sleep(random.randint(4, 7))
    # Create containers
    html_soup = BeautifulSoup(response.text, 'html.parser')
    ad_containers = html_soup.find_all('h2', class_='listing-title title-wrap')
    price_containers = html_soup.find_all('section', class_='price-column')
    for container in ad_containers:
        name = container.find('a', class_="js-click-handler listing-fpa-link").text
        names.append(name)
        # Trying to loop through the key specs list and assign each 'li' to a different field in the table
        lis = []
        list_container = container.find('ul', class_='listing-key-specs')
        for li in list_container.find('li'):
            lis.append(li)
        year.append(lis[0])
        body_type.append(lis[1])
        milage.append(lis[2])
        engine.append(lis[3])
        hp.append(lis[4])
        transmission.append(lis[5])
        petrol_type.append(lis[6])
        lis = []  # Clearing the list to get ready for the next set of data
    for price_container in price_containers:
        price = price_container.find('div', class_='vehicle-price').text
        prices.append(price)

test_df = pandas.DataFrame({'Title': names, 'Price': prices, 'Year': year, 'Body Type': body_type, 'Mileage': milage, 'Engine Size': engine, 'HP': hp, 'Transmission': transmission, 'Petrol Type': petrol_type})
print(test_df.info())
# test_df.to_csv('Autotrader_test.csv')
I followed the advice from David in the other answer's comment area.
Code:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

pd.set_option('display.width', 1000)
pd.set_option('display.height', 1000)  # 'display.height' was removed in later pandas versions
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

names = []
prices = []
year = []
body_type = []
milage = []
engine = []
hp = []
transmission = []
petrol_type = []

for i in range(1, 2):
    response = get('https://www.autotrader.co.uk/car-search?sort=sponsored&seller-type=private&page=' + str(i))
    html_soup = BeautifulSoup(response.text, 'html.parser')
    outer = html_soup.find_all('article', class_='search-listing')
    for inner in outer:
        lis = []
        names.append(inner.find_all('a', class_="js-click-handler listing-fpa-link")[1].text)
        prices.append(inner.find('div', class_='vehicle-price').text)
        for ul in inner.find_all('ul', class_='listing-key-specs'):
            for spec in ul.find_all('li')[-7:]:  # keep the last seven spec items
                lis.append(spec.text)
        year.append(lis[0])
        body_type.append(lis[1])
        milage.append(lis[2])
        engine.append(lis[3])
        hp.append(lis[4])
        transmission.append(lis[5])
        petrol_type.append(lis[6])

test_df = pd.DataFrame.from_dict({'Title': names, 'Price': prices, 'Year': year, 'Body Type': body_type, 'Mileage': milage, 'Engine Size': engine, 'HP': hp, 'Transmission': transmission, 'Petrol Type': petrol_type}, orient='index')
print(test_df.transpose())
Output:
Title Price Year Body Type Mileage Engine Size HP Transmission Petrol Type
0 Citroen C3 1.4 HDi Exclusive 5dr £500 2002 (52 reg) Hatchback 123,065 miles 1.4L 70bhp Manual Diesel
1 Volvo V40 1.6 XS 5dr £585 1999 (V reg) Estate 125,000 miles 1.6L 109bhp Manual Petrol
2 Toyota Yaris 1.3 VVT-i 16v GLS 3dr £700 2000 (W reg) Hatchback 94,000 miles 1.3L 85bhp Automatic Petrol
3 MG Zt-T 2.5 190 + 5dr £750 2002 (52 reg) Estate 95,000 miles 2.5L 188bhp Manual Petrol
4 Volkswagen Golf 1.9 SDI E 5dr £795 2001 (51 reg) Hatchback 153,000 miles 1.9L 68bhp Manual Diesel
5 Volkswagen Polo 1.9 SDI Twist 5dr £820 2005 (05 reg) Hatchback 106,116 miles 1.9L 64bhp Manual Diesel
6 Volkswagen Polo 1.4 S 3dr (a/c) £850 2002 (02 reg) Hatchback 125,640 miles 1.4L 75bhp Manual Petrol
7 KIA Picanto 1.1 LX 5dr £990 2005 (05 reg) Hatchback 109,000 miles 1.1L 64bhp Manual Petrol
8 Vauxhall Corsa 1.2 i 16v SXi 3dr £995 2004 (54 reg) Hatchback 81,114 miles 1.2L 74bhp Manual Petrol
9 Volkswagen Beetle 1.6 3dr £995 2003 (53 reg) Hatchback 128,000 miles 1.6L 102bhp Manual Petrol
The ul is not a child of the h2; it's a sibling. So you will need to make a separate selection, because it's not part of ad_containers.
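A minimal sketch of that separate selection, assuming (as in the markup of the time) that the spec list sits at the same level as the h2:

for container in ad_containers:
    names.append(container.find('a', class_='js-click-handler listing-fpa-link').text)
    # the key-specs ul is a sibling of the h2, so search forward from the h2 itself
    spec_list = container.find_next_sibling('ul', class_='listing-key-specs')
    if spec_list:
        specs = [li.get_text(strip=True) for li in spec_list.find_all('li')]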
URL: http://www.imdb.com/chart/?ref_=nv_ch_cht_2
I want to print the top box office list from the above site (each movie's rank, title, weekend, gross, and weeks, in that order).
Example output:
Rank:1
title: godzilla
weekend:$93.2M
Gross:$93.2M
Weeks: 1
Rank: 2
title: Neighbours
This is just a simple way to extract those entities with BeautifulSoup:
from bs4 import BeautifulSoup
import urllib2

url = "http://www.imdb.com/chart/?ref_=nv_ch_cht_2"
data = urllib2.urlopen(url).read()
page = BeautifulSoup(data, 'html.parser')

rows = page.findAll("tr", {'class': ['odd', 'even']})
for tr in rows:
    for cell in tr.findAll("td", {'class': ['titleColumn', 'weeksColumn', 'ratingColumn']}):
        print cell.get_text()
P.S. Arrange the output however you like.
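For the labelled layout in the question, one arrangement is to read each row's cells by class and print them with labels. A sketch in the same Python 2 style, reusing rows from above and assuming the chart markup of the time put two ratingColumn cells per row (weekend, then gross):

for rank, tr in enumerate(rows, 1):
    title = tr.find("td", {"class": "titleColumn"}).get_text(strip=True)
    rating_cells = tr.findAll("td", {"class": "ratingColumn"})  # weekend and gross
    weekend = rating_cells[0].get_text(strip=True) if rating_cells else ""
    gross = rating_cells[1].get_text(strip=True) if len(rating_cells) > 1 else ""
    weeks = tr.find("td", {"class": "weeksColumn"}).get_text(strip=True)
    print "Rank: %d\ntitle: %s\nweekend: %s\nGross: %s\nWeeks: %s\n" % (rank, title, weekend, gross, weeks)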
There is no need to scrape anything. See the answer I gave here.
How to scrape data from imdb business page?
The Python script below will give you: 1) the list of Top Box Office movies from IMDb, and 2) the list of cast members for each of them.
from lxml.html import parse

def imdb_bo(no_of_movies=5):
    bo_url = 'http://www.imdb.com/chart/'
    bo_page = parse(bo_url).getroot()
    bo_table = bo_page.cssselect('table.chart')
    bo_total = len(bo_table[0][2])
    if no_of_movies <= bo_total:
        count = no_of_movies
    else:
        count = bo_total
    movies = {}
    for i in range(0, count):
        mo = {}
        mo['url'] = 'http://www.imdb.com' + bo_page.cssselect('td.titleColumn')[i][0].get('href')
        mo['title'] = bo_page.cssselect('td.titleColumn')[i][0].text_content().strip()
        mo['year'] = bo_page.cssselect('td.titleColumn')[i][1].text_content().strip(" ()")
        mo['weekend'] = bo_page.cssselect('td.ratingColumn')[i*2].text_content().strip()
        mo['gross'] = bo_page.cssselect('td.ratingColumn')[(i*2)+1][0].text_content().strip()
        mo['weeks'] = bo_page.cssselect('td.weeksColumn')[i].text_content().strip()
        m_page = parse(mo['url']).getroot()
        m_casttable = m_page.cssselect('table.cast_list')
        flag = 0
        mo['cast'] = []
        for cast in m_casttable[0]:
            if flag == 0:
                flag = 1
            else:
                m_starname = cast[1][0][0].text_content().strip()
                mo['cast'].append(m_starname)
        movies[i] = mo
    return movies

if __name__ == '__main__':
    no_of_movies = raw_input("Enter no. of Box office movies to display:")
    bo_movies = imdb_bo(int(no_of_movies))
    for k, v in bo_movies.iteritems():
        print '#'+str(k+1)+' '+v['title']+' ('+v['year']+')'
        print 'URL: '+v['url']
        print 'Weekend: '+v['weekend']
        print 'Gross: '+v['gross']
        print 'Weeks: '+v['weeks']
        print 'Cast: '+', '.join(v['cast'])
        print '\n'
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan Skarsgård, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long