I made a script to scrape the hotel name, rating and perks from the hotels on this page: link
Here's my script:
import numpy as np
import time
from random import randint
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Referer': 'https://www.espncricinfo.com/',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
url0 = 'https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city&'
links1 = []
results = requests.get(url0, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)]
root_url = 'https://www.booking.com/'
urls1 = [ '{root}{i}'.format(root=root_url, i=i) for i in links1 ]
pointforts = []
hotels = []
notes = []
for url in urls1:
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
try :
div = soup.find("div", {"class": "hp_desc_important_facilities clearfix hp_desc_important_facilities--bui"})
pointfort = [x['data-name-en'] for x in div.select('div[class*="important_facility"]')]
pointforts.append(pointfort)
except:
pointforts.append('Nan')
try:
note = soup.find('div', class_ = 'bui-review-score__badge').text
notes.append(note)
except:
notes.append('Nan')
try:
hotel = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
hotels.append(hotel)
except:
hotels.append('Nan')
data = pd.DataFrame({
'Notes' : notes,
'Points fort' : pointforts,
'Nom' : hotels})
#print(data.head(20))
data.to_csv('datatest.csv', sep=';', index=False, encoding = 'utf_8_sig')
It worked: I made a loop to scrape all the hotel links and then scraped the ratings and perks for each of those hotels. But I had duplicates, so instead of:
links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', href=True)]
I put: links1 = [a['href'] for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)], as you can see in my script above.
But now it doesn't work anymore: I get only Nan, whereas before, when I had duplicates, a few rows were Nan but most of them had the perks and ratings I wanted. I don't understand why.
Here's the html for the hotels links :
hotellink
Here's the HTML to get the name (after I obtain the link, the script goes to that link):
namehtml
And here's the HTML to get all the perks related to the hotel (like the name, the script goes to the link I scraped before):
perkshtml
And here's my result...
output
The href attributes on that website contain newlines: one at the start and some midway through. As a result, when you concatenate them with root_url you are not getting valid URLs.
A fix is to remove all the newlines. Since the href always starts with a /, that slash can also be dropped from root_url, or you could use urllib.parse.urljoin().
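For example, a minimal sketch of both options (the href value below is made up to illustrate the stray newlines; it is not a real link from the page):

from urllib.parse import urljoin

href = '\n/hotel/fr/example-hotel.fr.html\n?label=gen173nr'  # hypothetical scraped value with newlines

# Option 1: strip the newlines and concatenate with a root that has no trailing slash
url_a = 'https://www.booking.com' + href.replace('\n', '')

# Option 2: strip the newlines and let urljoin() resolve the leading slash against the root
url_b = urljoin('https://www.booking.com/', href.replace('\n', ''))

print(url_a == url_b)  # True

Either way you end up with a clean absolute URL that requests can fetch.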
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Referer': 'https://www.espncricinfo.com/',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
url0 = 'https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city&'
results = requests.get(url0, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
links1 = [a['href'].replace('\n','') for a in soup.find("div", {"class": "hotellist sr_double_search"}).find_all('a', class_ = 'js-sr-hotel-link hotel_name_link url', href=True)]
root_url = 'https://www.booking.com'
urls1 = [f'{root_url}{i}' for i in links1]
pointforts = []
hotels = []
notes = []
for url in urls1:
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
try:
div = soup.find("div", {"class": "hp_desc_important_facilities clearfix hp_desc_important_facilities--bui"})
pointfort = [x['data-name-en'] for x in div.select('div[class*="important_facility"]')]
pointforts.append(pointfort)
except:
pointforts.append('Nan')
try:
note = soup.find('div', class_ = 'bui-review-score__badge').text
notes.append(note)
except:
notes.append('Nan')
try:
hotel = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
hotels.append(hotel)
except:
hotels.append('Nan')
data = pd.DataFrame({
'Notes' : notes,
'Points fort' : pointforts,
'Nom' : hotels})
#print(data.head(20))
data.to_csv('datatest.csv', sep=';', index=False, encoding = 'utf_8_sig')
This would give you an output CSV file starting:
Notes;Points fort;Nom
8,3 ;['Parking (fee required)', 'Free WiFi Internet Access Included', 'Family Rooms', 'Airport Shuttle', 'Non Smoking Rooms', '24 hour Front Desk', 'Bar'];Elysées Union
8,4 ;['Free WiFi Internet Access Included', 'Family Rooms', 'Non Smoking Rooms', 'Pets allowed', '24 hour Front Desk', 'Rooms/Facilities for Disabled'];Hyatt Regency Paris Etoile
8,3 ;['Free WiFi Internet Access Included', 'Family Rooms', 'Non Smoking Rooms', 'Pets allowed', 'Restaurant', '24 hour Front Desk', 'Bar'];Pullman Paris Tour Eiffel
8,7 ;['Free WiFi Internet Access Included', 'Non Smoking Rooms', 'Restaurant', '24 hour Front Desk', 'Rooms/Facilities for Disabled', 'Elevator', 'Bar'];citizenM Paris Gare de Lyon
Related
I need to scrape "2015" and "09/09/2015" from the link below:
lacentrale.fr/auto-occasion-annonce-87102353714.html
But since there are many li and ul elements, I can't scrape the exact text. I used the code below. Your help is highly appreciated.
from bs4 import BeautifulSoup
soup = BeautifulSoup(HTML)
soup.find('span', {'class':'optionLabel'}).find_next('span').get_text()
I am a fan of CSS selectors and :-soup-contains(), as mentioned in @Andrej's answer. So, just in case, here is an alternative approach for when more of the options are needed.
Generate a dict with all the options and pick the relevant value, using the option label as the key:
data = dict((e.button.text,e.find_next('span').text) for e in soup.select('.optionLabel'))
data looks like:
{'Année': '2015', 'Mise en circulation': '09/09/2015', 'Contrôle technique': 'requis', 'Kilométrage compteur': '68 736 Km', 'Énergie': 'Electrique', 'Rechargeable': 'oui', 'Autonomie batterie': '190 Km', 'Capacité batterie': '22 kWh', 'Boîte de vitesse': 'automatique', 'Couleur extérieure': 'gris foncé metal', 'Couleur intérieure': 'cuir noir', 'Nombre de portes': '5', 'Nombre de places': '4', 'Garantie': '6 mois', 'Première main (déclaratif)': 'non', 'Nombre de propriétaires': '2', 'Puissance fiscale': '3 CV', 'Puissance din': '102 ch', 'Puissance moteur': '125 kW', "Crit'Air": '0', 'Émissions de CO2': '0 g/kmA', 'Norme Euro': 'EURO6', 'Prime à la conversion': ''}
Example
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
url = 'https://www.lacentrale.fr/auto-occasion-annonce-87102353714.html'
soup = BeautifulSoup(requests.get(url, headers=headers).text)
data = dict((e.button.text,e.find_next('span').text) for e in soup.select('.optionLabel'))
print(data['Année'], data['Mise en circulation'], sep='\n')
Output
2015
09/09/2015
Try:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0"
}
url = "https://www.lacentrale.fr/auto-occasion-annonce-87102353714.html"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
v1 = soup.select_one('.optionLabel:-soup-contains("Année") + span')
v2 = soup.select_one(
'.optionLabel:-soup-contains("Mise en circulation") + span'
)
print(v1.text)
print(v2.text)
Prints:
2015
09/09/2015
Below is the code for extracting news articles for different companies from Google News. It gives me an empty Excel file with only the headers. I am not able to figure out what the error is. Can someone please help out? (The entire code can be copy-pasted and reproduced on your machine.)
import requests
import random
from collections import OrderedDict
def list_header():
headers_list = [
# Firefox 24 Linux
{
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
},
# Firefox Mac
{
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}]
return headers_list
def list_dict():
# Get headers list
headers_list = list_header()
# Create ordered dict from Headers above
ordered_headers_list = []
for headers in headers_list:
h = OrderedDict()
for header,value in headers.items():
h[header]=value
ordered_headers_list.append(h)
return ordered_headers_list
def list_test():
headers_list = list_dict()
max = len(headers_list)
url = 'https://httpbin.org/headers'
for i in range(0,max):
#Pick a random browser headers
headers = random.choice(headers_list)
#Create a request session
r = requests.Session()
r.headers = headers
response = r.get(url)
print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Received by HTTPBin:"%(i,headers['User-Agent']))
print(response.json())
print("-------------------")
def random_header():
headers_list = list_dict()
headers = random.choice(headers_list)
return headers
import pandas as pd
def ingest_google_news():
ticker_list = ['AAPL.O', 'MSFT', 'GOOG', '2222.SR', 'AMZN', 'FB', 'TSLA', 'BRK-A', 'TCEHY', 'TSM', 'NVDA', 'JPM', 'BABA', 'V', 'JNJ', '005930.KS', 'WMT', 'LVMUY']
sep = '.'
df = pd.DataFrame()
t_news = []
t_publisher = []
t_urls = []
t_dates = []
t_tickers = []
for t in ticker_list:
news = []
publisher = []
urls = []
dates = []
tickers = []
# cleaning ticker
ticker = t
t = t.split(sep, 1)[0]
# set header by random user agent
r = requests.Session()
headers = random_header()
r.headers = headers
# print(headers)
# set query for google
query = '{} news'.format(t)
url = f"https://www.google.com/search?q={query}&tbm=nws&lr=lang_en&hl=en&sort=date&num=5"
res = r.get(url, headers=headers)
soup = bs4.BeautifulSoup(res.text, "html.parser")
links = soup.select(".dbsr a")
for l in links:
tickers.append(t)
try:
url_w = l.get("href")
print(url_w)
urls.append(url_w)
dt = find_date(url_w)  # NOTE: find_date is never defined or imported in this snippet
dates.append(dt)
res = requests.get(url_w, headers=headers)
parsed_article = bs4.BeautifulSoup(res.text,'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
article_text += p.text
except Exception as e:
article_text = ''
news.append(article_text)
sources = soup.select(".XTjFC g-img")
for s in sources:
publisher.append(s.next_sibling.lower())
t_urls += urls
t_news += news
t_publisher += publisher
t_dates += dates
t_tickers += tickers
df['ticker'] = t_tickers
df['links'] = t_urls
df['article_text'] = t_news
df['publisher'] = t_publisher
df['created_at'] = t_dates
# import to csv
today = datetime.date.today()
d1 = today.strftime("%d%m%Y")
df.to_csv(f'/content/drive/MyDrive/google_news_{d1}.csv')
del news, publisher, urls, dates, tickers
del t_news, t_publisher, t_urls, t_dates, t_tickers
import bs4
from bs4 import BeautifulSoup
import datetime
ingest_google_news()
The code above is from the following link: https://medium.com/analytics-vidhya/google-scraping-using-beautifulsoup-d53746ef5a32
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser. CSS selectors reference.
Code and example in the online IDE (extracts the title, link, snippet, date published, source, and stores to CSV):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd  # needed for the DataFrame/CSV step at the end
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
# iterate over each ticker
for news in ticker_list:
params = {
"q": news, # query
"hl": "en", # language
"gl": "us", # country to search from
"tbm": "nws", # google news filter
}
# store news data
news_data = []
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
# container with needed data
for result in soup.select('.WlydOe'):
title = result.select_one('.nDgy9d').text
link = result['href']
source = result.select_one('.CEMjEf span').text
snippet = result.select_one('.GI74Re').text
date_published = result.select_one('p.S1FAPd').text
print(f'{title}\n{link}\n{snippet}\n{date_published}\n{source}\n')
# append news data to list in dict() format to save it later via pandas
news_data.append({
'title': title,
'link': link,
'snippet': snippet,
'date_published': date_published,
'source': source
})
# create DataFrame from the list()
df = pd.DataFrame(news_data)
# save DataFrame to csv without default index column on the left side
df.to_csv('bs4_final.csv', index=False)
get_news()
----------
'''
EU plans to legislate for common phone charger despite ...
https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
... a person familiar with the matter said - a move likely to affect iPhone
maker Apple (AAPL.O) more than its rivals.
2 hours ago
Reuters
Wall Street ends sharply lower in broad sell-off
https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
O), Amazon.com Inc (AMZN.O), Apple Inc (AAPL.O), Facebook Inc (FB. ... O)
were among the biggest drags on the index as well as the S&P 500.
18 hours ago
Reuters
... other results
'''
Alternatively, you can achieve the same thing by using Google News Results API from SerpApi. It's a paid API with a free plan.
The main difference in your case is that you don't need to make things that complicated, figure out why things don't work as expected, and then maintain the scraper over time. Instead, you only need to iterate over structured JSON and grab what you want, fast.
Code to integrate (same process for saving to csv as with code above):
import os
from serpapi import GoogleSearch
ticker_list = ['AAPL.O', 'LVS', 'COTY.K','JPM', 'XOM', '005930.KS']
def get_news():
for news in ticker_list:
params = {
"engine": "google",
"q": news,
"gl": "us",
"tbm": "nws",
"api_key": os.getenv("API_KEY"), # API environment variable
}
search = GoogleSearch(params)
results = search.get_dict()
for news_result in results["news_results"]:
print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
get_news()
-----------
'''
Title: EU plans to legislate for common phone charger despite ...
Link: https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
Title: Wall Street ends sharply lower in broad sell-off
Link: https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
Title: S&P 500 down more than 2% as growth worries rise
Link: https://www.reuters.com/business/sp-500-down-more-than-2-growth-worries-rise-2021-09-20/
Title: Apple joins streaming elite, Netflix crosses milestone with ...
Link: https://www.reuters.com/technology/apple-joins-streaming-elite-netflix-crosses-milestone-with-emmy-wins-2021-09-20/
... other results
'''
Disclaimer, I work for SerpApi.
I am trying to scrape design names, creator names, fabric types, and prices per fabric type from this URL: https://www.spoonflower.com/en/shop?on=fabric
The good thing is they have public API endpoints, which makes the data extraction simple.
But the problem is they have different URLs for design names and for pricing,
i.e. to collect the design names and the creator names I have to request this URL: https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
And for pricing per fabric type I request this endpoint:
https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
I am getting the correct data, but I stumbled across some formatting issues.
What I am looking for is something like this:
each design with its fabric types and their prices in a single row. Instead, I am getting this kind of output.
It would be great if anyone here could guide me on how to get the expected output I am looking for.
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
scraped_items = []
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name'],
screeName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
scraped_items.append({
'designName': designName,
'screenName': screeName,
'fabric_name': fabric_name,
'test_swatch_meter': test_swatch_meter,
'fat_quarter_meter': fat_quarter_meter,
'meter': meter
})
print(designName, screeName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
print(json.dumps(scraped_items, indent=2))
#print(type(details_endpoint))
#print(type(items_json['page_results'][0]))
with open('scraped_data.csv', 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
writer.writeheader()
for row in scraped_items:
writer.writerow(row)
#print(fabric)
One way to do it is to reconfigure how you construct the output. Instead of a list, use a dictionary keyed on (designName, screenName), with the fabric values stored under that key. One thing to keep in mind is that dictionaries don't allow duplicate keys, so I had to number the column names; you can remove the numbers later if you'd like.
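To make the structure concrete, a single entry of that dictionary ends up looking roughly like this (the design name, screen name, fabric codes and prices below are made up for illustration):

# hypothetical shape of one items_dict entry after two fabric types have been priced
items_dict[('Watercolor Dots', 'some_designer')] = {
    'fabric_name_00': 'PETAL_SIGNATURE_COTTON',
    'test_swatch_meter_00': 5.5,
    'fat_quarter_meter_00': 12.0,
    'meter_00': 24.0,
    'fabric_name_01': 'COTTON_SPANDEX_JERSEY',
    'test_swatch_meter_01': 6.5,
    'fat_quarter_meter_01': 14.5,
    'meter_01': 29.0,
}

Each (designName, screenName) key then becomes one row of the final DataFrame, and the numbered keys become its columns.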
See if this gets what you are wanting:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': 'https://www.spoonflower.com',
'Connection': 'keep-alive',
'Referer': 'https://www.spoonflower.com/',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
}
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
fabric.append(("_".join(fab.upper().split())))
#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
items_dict = OrderedDict()
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name']
screenName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
try:
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
except:
test_swatch_meter = 'N/A'
try:
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
except:
fat_quarter_meter = 'N/A'
try:
meter = details_endpoint_response['data']['pricing']['METER']['price']
except:
meter = 'N/A'
if (designName, screenName) not in items_dict.keys():
items_dict[(designName, screenName)] = {}
itemCount = len(items_dict[(designName, screenName)].values()) / 4
items_dict[(designName, screenName)].update({'fabric_name_%02d' %itemCount: fabric_name,
'test_swatch_meter_%02d' %itemCount: test_swatch_meter,
'fat_quarter_meter_%02d' %itemCount: fat_quarter_meter,
'meter_%02d' %itemCount: meter})
print(designName, screenName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0':'designName','level_1':'screenName'})
df.to_csv('scraped_data.csv', index=False)
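Since the numbering can be removed later, one possible way to do that on the resulting DataFrame is sketched below (this leaves duplicate column labels, which is fine for a CSV but can be awkward for further pandas work):

import re

# strip the trailing _00 / _01 ... counters from the generated column names
df.columns = [re.sub(r'_\d{2}$', '', c) for c in df.columns]
df.to_csv('scraped_data_unnumbered.csv', index=False)  # hypothetical output filename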
I am gathering housing data from Zillow's website. So far I have gathered data from the first page. For my next step, I am trying to find the link behind the next button, which will navigate me to page 2, page 3, and so on. I used the Inspect feature of Chrome to locate the 'next button', which has the following structure:
Next
I then used Beautiful Soup's find_all method, filtering on tag "a" and class "on". I used the following code to extract all the links:
driver = webdriver.Chrome(chromedriver)
zillow_bellevue_1="https://www.zillow.com/homes/Bellevue-WA-98004_rb/"
driver.get(zillow_bellevue_1)
soup = BeautifulSoup(driver.page_source,'html.parser')
next_button = soup.find_all("a", class_="on")
print(next_button)
I am not getting any output. Any inputs on where I am going wrong?
The class for the next button appears to be off, not on. As such, you could scrape the details of each property and advance through all the pages as follows. It uses the requests library to get the HTML, which should be faster than using a Chrome driver.
from bs4 import BeautifulSoup
import requests
base_url = "https://www.zillow.com"
url = base_url + "/homes/Bellevue-WA-98004_rb/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
while url:
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
print('\n' + url)
for div in soup.find_all('div', class_="zsg-photo-card-caption"):
print(" {}".format(list(div.stripped_strings)))
next_button = soup.find("a", class_="off", href=True)
url = base_url + next_button['href'] if next_button else None
This continues requesting URLs until no next button is found. The output would be of the form:
https://www.zillow.com/homes/Bellevue-WA-98004_rb/
['New Construction', '$2,224,995+', '5 bds', '·', '4 ba', '·', '3,796+ sqft', 'The Castille Plan, Verano', 'D.R. Horton - Seattle']
['12 Central Square', '2', '$2,550+', '10290 NE 12th St, Bellevue, WA']
['Apartment For Rent', '$1,800/mo', '1 bd', '·', '1 ba', '·', '812 sqft', '10423 NE 32nd Pl APT E105, Bellevue, WA']
['House For Sale', '$1,898,000', '5 bds', '·', '4 ba', '·', '4,030 sqft', '3230 108th Ave SE, Bellevue, WA', 'Quorum Real Estate/Madison Inc']
['New Construction', '-- bds', '·', '-- ba', '·', '-- sqft', 'Coming Soon Plan, Northtowne', 'D.R. Horton - Seattle']
['The Meyden', '0', '$1,661+', '1', '$2,052+', '2', '$3,240+', '10333 Main St, Bellevue, WA']
I think it will be easier if you use soup.findAll. My solution goes this way:
import re
import requests
from bs4 import BeautifulSoup

zillow_url = URL  # URL is a placeholder for the Zillow search results page you want
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
response = requests.get(zillow_url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
prices = ["$" + re.sub(r'(\s\d)|(\W)|([a-z]+)', "", div.text.split("/")[0], ) for div in
soup.find_all('div', class_='list-card-price')]
# print(prices)
addresses = [div.text for div in
soup.findAll('address', class_='list-card-addr')]
urls = [x.get('href') if 'http' in x.get('href') else 'https://www.zillow.com' + x.get('href') for x in soup.find_all("a", class_="list-card-link list-card-link-top-margin list-card-img")]
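If you then want the prices, addresses and URLs lined up together, a minimal sketch (this assumes the three lists come back the same length, which is worth checking first):

import pandas as pd

# assumes prices, addresses and urls are the parallel lists built above
listings = pd.DataFrame({'price': prices, 'address': addresses, 'url': urls})
print(listings.head())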
I need to get the information that was published today and the day before. Also, when exporting it to a CSV file, only the first column is written, not the remaining ones.
The URL: https://e-mehkeme.gov.az/Public/Cases
The dates are stored in the HTML as <td style="width:95px;text-align:center">28.10.2019</td>
import requests, re
from bs4 import BeautifulSoup as bs
import csv
request_headers = {
'authority': 'e-mehkeme.gov.az',
'method': 'POST',
'path': '/Public/Cases',
'scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en,en-GB;q=0.9',
'cache-control': 'max-age=0',
'content-length': '66',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://e-mehkeme.gov.az',
'referer': 'https://e-mehkeme.gov.az/Public/Cases',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/75.0.3770.142 Safari/537.36',
}
voens = {'3100608381',
}
form_data = {
'CourtId': '',
'CaseNo': '',
'DocFin': '',
'DocSeries': '',
'DocNumber': '',
'VOEN': voens,
'button': 'Search',
}
url = 'https://e-mehkeme.gov.az/Public/Cases?courtid='
response = requests.post(url, data=form_data, headers=request_headers)
s = bs(response.content, 'lxml')
# PRINT THE CONTENTS OF EACH SEARCH!
for voen in voens:
form_data['VOEN'] = voen
r = requests.post('https://e-mehkeme.gov.az/Public/Cases', data=form_data)
soup = bs(r.text, 'lxml')
ids = [i['value'] for i in soup.select('.casedetail')]
for i in ids:
r = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i}')
soup = bs(r.content, 'lxml')
output = [re.sub('\s+', ' ', i.text.strip()) for i in soup.select('[colspan="4"]')]
print(output)
with open('courtSearch.csv', 'w', newline='', encoding='utf-8') as myfile:
writer = csv.writer(myfile, quoting=csv.QUOTE_ALL)
writer.writerow(output)
DESIRED OUTPUT:
The following uses a slightly different URL construct so you can use a GET request and easily gather all pages of results per voen. I gather the string dates and caseIds (required for the later requests) during each request. I then use a mask of the days of interest (e.g. today and yesterday, converted to strings in the same format as on the website) to filter for only the ids within the desired date range. I then loop over that filtered list and issue requests for the pop-up window info.
Within the code you can also see commented-out sections, one of which shows you the results retrieved from each page:
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
I split on the header phrases (assuming these are regular) so that each row string can be split into the appropriate output columns.
Possibly requires bs4 4.7.1+.
import requests,re, csv
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
import pandas as pd
headers = ['Ətraflı məlumat: ', 'Cavabdeh: ', 'İddiaçı: ', 'İşin mahiyyəti ']
voens = ['2002283071','1303450301', '1700393071']
number_of_past_days_plus_today = 2
mask = [datetime.strftime(datetime.now() - timedelta(day_no), '%d.%m.%Y') for day_no in range(0, number_of_past_days_plus_today)]
ids = []
table_dates = []
with requests.Session() as s:
for voen in voens:
#print(voen) ##view voen
page = 1
while True:
r = s.get(f'https://e-mehkeme.gov.az/Public/Cases?page={page}&voen={voen}') #to get all pages of results
soup = bs(r.text, 'lxml')
ids.extend([i['value'] for i in soup.select('.casedetail')])
#print(pd.read_html(str(soup.select_one('#Cases')))[0]) ##view table
table_dates.extend([i.text.strip() for i in soup.select('#Cases td:nth-child(2):not([colspan])')])
if soup.select_one('[rel=next]') is None:
break
page+=1
pairs = list(zip(table_dates,ids))
filtered = [i for i in pairs if i[0] in mask]
#print(100*'-') ##spacing
#print(filtered) ##view final filtered list of ids
results = []
for j in filtered:
r = s.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={j[1]}')
soup = bs(r.content, 'lxml')
line = ' '.join([re.sub('\s+',' ',i.text.strip()) for i in soup.select('[colspan="4"]')])
row = re.split('|'.join(headers),line)
results.append(row[1:])
with open("results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
w.writerow(headers)
for row in results:
w.writerow(row)
I searched for splitting on multiple delimiters and used the idea given by @Jonathan here, so I upvoted that answer for credit.
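For reference, the multi-delimiter split works like this (the case line below is a made-up example in the same shape as the scraped detail text):

import re

headers = ['Ətraflı məlumat: ', 'Cavabdeh: ', 'İddiaçı: ', 'İşin mahiyyəti ']
line = 'Ətraflı məlumat: 2(102)-1234/2019 Cavabdeh: EXAMPLE MMC İddiaçı: SAMPLE ASC İşin mahiyyəti ödəniş tələbi'
row = re.split('|'.join(headers), line)
print(row[1:])  # ['2(102)-1234/2019 ', 'EXAMPLE MMC ', 'SAMPLE ASC ', 'ödəniş tələbi']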