How to scrape data from a website when it returns a false response - python

I want to scrape data from a website, but I'm getting one error. I'm new to web scraping, so please guide me on how to fix this issue. Here is the error I am facing: UnboundLocalError: local variable 'soup' referenced before assignment
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
    return soup

def get_detail_page(soup):
    try:
        title = soup.find('h1', class_="ProductTitle-title", id=False).text
    except:
        title = 'Empty Title'
    try:
        title = soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text
    except:
        price = 'Empty price'
    try:
        img = soup.find('img', class_="ViewSelectorItem-image", id=False).get('src')
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()

Without a User-Agent the server responds with a non-200 status, so the else branch in get_page never runs, soup is never assigned, and return soup raises the UnboundLocalError. I've added the user-agent to your code:
import urllib.request as urllib2
from bs4 import BeautifulSoup
import csv

REQUEST_HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

def get_page(url):
    req = urllib2.Request(url, headers=REQUEST_HEADER)
    page = urllib2.urlopen(req, timeout=20).read()
    soup = BeautifulSoup(page, "html.parser")
    return soup

def get_detail_page(soup):
    try:
        title = soup.find('h1', class_="ProductTitle-title", id=False).text
    except:
        title = 'Empty Title'
    try:
        # note: this block should assign to price, not title, otherwise price is never defined
        price = soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text
    except:
        price = 'Empty price'
    try:
        img = soup.find('img', class_="ViewSelectorItem-image", id=False).get('src')
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
Also, a pretty interesting read: Google Chrome: Change the User-Agent String
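If you'd rather stick with requests instead of switching to urllib, the same fix works there too. This is only a minimal sketch (not part of the original answer); the header value is an assumption and may need adjusting if the site still blocks the request:

import requests
from bs4 import BeautifulSoup

# assumed User-Agent; any realistic browser string should do
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}

def get_page(url):
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()  # fail loudly instead of silently returning an unassigned soup
    return BeautifulSoup(response.text, 'html.parser')

raise_for_status() makes the failure explicit, so get_detail_page never receives a half-initialized value.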

Related

Multiple values against the same tag not scraping

I'm getting no values for my "Number of Rooms" and "Room" search.
https://www.zoopla.co.uk/property/uprn/906032139/
I can see here that I should be returning something but not getting anything.
Can anyone point me in the right direction on how to solve this? I'm not even sure what to search for, since it isn't erroring. I thought it would pull all the data in and then I would need to figure out a way to separate it. Do I need to scrape it into a dictionary?
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://google.co.uk",
    "DNT": "1"
}

page = 1
addresses = []

while page != 2:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    print(url)
    response = requests.get(url, headers=headers)
    print(response)
    html = response.content
    soup = bs(html, "lxml")
    time.sleep(1)
    for address in soup.find_all("div", class_="c-rgUPM c-rgUPM-pnwXf-hasUprn-true"):
        details = {}
        # Getting the address
        details["Address"] = address.h2.get_text(strip=True)
        # Getting each addresses unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        details["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, "lxml")
        # Lists status of the property
        try:
            details["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
                "span", class_="css-10o3xac-Tag e164ranr11")]
        except AttributeError:
            details["Status"] = ""
        # Lists the date of the status of the property
        try:
            details["Status Date"] = [status_date.get_text(
                strip=True) for status_date in scotland_house_soup.find_all("p", class_="css-1jq4rzj e164ranr10")]
        except AttributeError:
            details["Status Date"] = ""
        # Lists the value of the property
        try:
            details["Value"] = [value.get_text(strip=True).replace(",", "").replace(
                "£", "") for value in scotland_house_soup.find_all("p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            details["Value"] = ""
        # Lists the number of rooms
        try:
            details["Number of Rooms"] = [number_of_rooms.get_text(strip=True) for number_of_rooms in scotland_house_soup.find_all(
                "p", class_="css-82kmy1 e13gx5i3")]
        except AttributeError:
            details["Number of Rooms"] = ""
        # Lists type of room
        try:
            details["Room"] = [room.get_text(strip=True) for room in scotland_house_soup.find_all(
                "span", class_="css-1avcdf2 e13gx5i4")]
        except AttributeError:
            details["Room"] = ""
        addresses.append(details)
    page = page + 1

for address in addresses[:]:
    print(address)

print(response)
Selecting by class_="css-1avcdf2 e13gx5i4" is brittle; a generated class like that can change at any time. Try a different CSS selector:
import requests
from bs4 import BeautifulSoup

url = "https://www.zoopla.co.uk/property/uprn/906032139/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

tag = soup.select_one('#timeline p:has(svg[data-testid="bed"]) + p')
no_beds, beds = tag.get_text(strip=True, separator=" ").split()
print(no_beds, beds)
Prints:
1 bed
If you want all types of rooms:
for detail in soup.select("#timeline p:has(svg[data-testid]) + p"):
    n, type_ = detail.get_text(strip=True, separator="|").split("|")
    print(n, type_)
Prints:
1 bed
1 bath
1 reception
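If you want to fold that back into the per-property dictionary from your loop, one option is a small helper that returns a room-type-to-count mapping. This is only a sketch: get_rooms is a name introduced here, and the #timeline selector is the one from the answer above, which may break if Zoopla changes its markup:

import requests
from bs4 import BeautifulSoup

def get_rooms(property_url, headers=None):
    # returns e.g. {'bed': '1', 'bath': '1', 'reception': '1'} for one property page
    soup = BeautifulSoup(requests.get(property_url, headers=headers).content, "html.parser")
    rooms = {}
    for detail in soup.select("#timeline p:has(svg[data-testid]) + p"):
        count, room_type = detail.get_text(strip=True, separator="|").split("|")
        rooms[room_type] = count
    return rooms

print(get_rooms("https://www.zoopla.co.uk/property/uprn/906032139/"))

Inside your loop you could then set details["Rooms"] = get_rooms(scotland_house_url, headers=headers) instead of the class-based "Number of Rooms" and "Room" lookups.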

How do I write data from Python line by line to a CSV file?

I'm just learning Python and want to improve through examples. Sorry for my English; I'm in the process of learning a new language. :)
The program pulls data from an e-commerce site.
When I save it as a CSV file, each new piece of data overwrites the previous data. I tried several examples, but it didn't work.
Thanks for your help.
import requests
import gettext
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as xls
import xlsxwriter

baseurl = "https://www.trendyol.com"
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41'
           }

for x in range(1, 62):
    r = requests.get(f'https://www.trendyol.com/cep-telefonu-x-c103498?pi={x}', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('div', class_='p-card-wrppr')
    for item in productlist:
        productname = item.find('span', class_='prdct-desc-cntnr-name').getText()
        productprice_old = item.find('div', class_='prc-box-sllng').getText()
        productprice_discount = item.find('div', class_='prc-box-dscntd')
        for productlink in item.find_all('a'):
            productlink = baseurl + productlink.get('href')
        if productprice_discount == None:
            productprice_discount = productprice_old
        else:
            productprice_discount = productprice_discount.getText()
        for merchant_name in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            merchant_name = soup.find('a', class_='merchant-text')
            if merchant_name == None:
                merchant_name = soup.find('a', class_='title')
            if merchant_name == None:
                merchant_name = soup.find('span', class_='product-description-market-place')
            if merchant_name == None:
                merchant_name = ('NULL')
            else:
                merchant_name = merchant_name.getText()
            break
        for product_image in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            product_image = soup.find_all('img', attrs={'class': 'detail-section-img'})
            image_src = [x['src'] for x in product_image]
            image_src = [x for x in image_src if x.endswith('.jpg' or '.png')]
            break
        data = [[productname, productlink, productprice_old, productprice_discount, merchant_name, image_src]]
        df = pd.DataFrame(data, columns=["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])
        df.to_csv('trendyol3.csv')
You should add mode='a', which means append, so each write appends to the file instead of rewriting it:
df.to_csv('trendyol3.csv', mode='a')
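One caveat (my addition, not part of the original answer): with mode='a' the column header is written on every call, so it repeats for each row. A common pattern is to write the header only when the file doesn't exist yet, roughly like this:

import os
import pandas as pd

csv_path = 'trendyol3.csv'
row = [['example product', 'https://example.com/p', '100 TL', '90 TL', 'some store', []]]
df = pd.DataFrame(row, columns=["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])

# append, but only write the header the first time the file is created
df.to_csv(csv_path, mode='a', index=False, header=not os.path.exists(csv_path))

Alternatively, collect every row into a list inside the loop and call pd.DataFrame(rows).to_csv(...) once after the loop finishes; that avoids both the overwriting and the repeated headers.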

Failed to parse content from a webpage using requests

I'm trying to create a script using the requests module (without using a session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from the dropdown.
Get the links to the detail page.
Grab these two fields from the detail page.
While creating the script with plain requests, I tried to make use of the cookies, but I ended up getting an AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link

def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What changes should I make to get the script working?
There's a redirect that occurs in fetch_detail_page_link. Python Requests follows redirects by default. When your script obtains the cookies, it is only grabbing the cookies for the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it preserved those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However if you insist on not using Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)  # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link

def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
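For comparison, the Session-based approach the comments recommend handles all of this cookie bookkeeping automatically. A minimal sketch (my addition, reusing the same login URL as above):

import requests

session = requests.Session()
session.headers.update({'User-Agent': "Scraping Your Vigentes 1.0"})

# the session stores cookies set by every response in the chain, including redirects,
# so later requests through the same session send them back without a manual cookie jar
res = session.get('https://compranet.hacienda.gob.mx/web/login.html')
print(res.status_code, session.cookies.get_dict())

You would then call session.get(...) everywhere the script above calls requests.get(...) and drop the cookies parameter entirely.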

Creating a dataframe from a dictionary is giving me a could not broadcast error

I am trying to create a data frame from a dictionary I have and it gives me an error that says:
> ValueError: could not broadcast input array from shape (3) into shape (1)
Here is the code:
import requests
import pandas as pd  # needed for pd.DataFrame below
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
product_links = []

for x in range(1, 13):
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            product_links.append(link['href'])

for thing in product_links:
    if '#' in product_links: product_links.remove('#')

# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'

for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    job_title = soup.find('h1', class_="brkword")
    job_location = soup.find('strong', class_="c2b")
    job_date = soup.find('span', class_="pdingleft10 brlefte5")
    try:
        seniority = soup.find_all('td', class_='value')[0].text.strip()
    except:
        print("")
    try:
        full_or_part = soup.find_all('td', class_='value')[1].text.strip()
    except:
        print("")
    try:
        education_level = soup.find_all('td', class_='value')[2].text.strip()
    except:
        print("")
    try:
        sector = soup.find_all('td', class_='value')[3].text.strip()
    except:
        print("")
    description = soup.find_all('p', class_='pding10')
    df = {
        "Job Title": job_title,
        "Job Location": job_location,
        "Post Date": job_date,
        "Seniority Level": seniority,
        "Full or Part time": full_or_part,
        "Educational Level": education_level,
        "Sector": sector,
        "Job Description": description
    }

job_data = pd.DataFrame(df)
Please tell me how I can transform the data I have into a data frame so that I can export it to a CSV.
For context: I was scraping this jobs website and it successfully returned around 500 jobs into the dictionary, but I was unfortunately not able to transform that into a dataframe so that I can later export it to a CSV file and do some analysis on it.
To create a dataframe from the job ads, you can try the next example (some column names need to be renamed from Arabic to English, though):
import requests
import pandas as pd
from bs4 import BeautifulSoup

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

product_links = []
for x in range(1, 2):  # <-- increase the range here
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            if link["href"] != "#":
                product_links.append(link["href"])

all_data = []
for link in product_links:
    print(f"Getting {link} ...")
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "lxml")
    d = {}
    job_title = soup.find("h1").get_text(strip=True)
    job_location = soup.find("strong", class_="c2b")
    job_date = soup.find("span", class_="pdingleft10 brlefte5")
    d["title"] = job_title
    d["location"] = job_location.get_text(strip=True) if job_location else "N/A"
    d["date"] = job_date.get_text(strip=True) if job_date else "N/A"
    for table in soup.select("table.item"):
        d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)
    all_data.append(d)

job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
This creates data.csv (screenshot of the output in LibreOffice omitted).
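About the original ValueError: the key difference here is that each job becomes its own small dict and pd.DataFrame is called once on the list of dicts, so pandas never has to line up values of different lengths (in your code, description came from find_all and is list-like, while the other fields are single values). A minimal illustration of the pattern, with made-up sample data:

import pandas as pd

# one dict per scraped job, appended to a list inside the loop
rows = [
    {"title": "Accountant", "location": "Cairo", "date": "Today"},
    {"title": "Driver", "location": "Giza"},  # missing keys simply become NaN
]

job_data = pd.DataFrame(rows)
print(job_data)
job_data.to_csv("data.csv", index=False)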

name 'get_pages_count' is not defined

I am writing a parser but I have a problem. I understand that you can find many similar questions on the Internet, but they did not suit my case, so I am asking for help here. I have little experience, so this question may not be phrased very well.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru/topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get.text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="lent-brief")
    games = []
    for item in items:
        games.append({
            "title": item.find("div", class_="title lent-title").get_text(strip=True),
            "date": item.find("div", class_="game-date").get_text(strip=True),
            "ganre": item.find("div", class_="game-genre").get_text(strip=True),
        })
    print(games)
    print(len(games))
    return games

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages_count(html.text)
        print(pages_count)
    else:
        print('Error')

parse()
Error:
File "D:/Python/parser1.py", line 45, in parse
pages_count = get_pages_count(html.text)
NameError: name 'get_pages_count' is not defined
Your function is named get_pages, but you're calling get_pages_count:
def get_pages(html):
.. but when attempting to call it:
pages_count = get_pages_count(html.text)
.. the call should be:
pages_count = get_pages(html.text)
In the function below, the method you have called is also wrong.
Instead of pagination[1].get.text(), it should be pagination[1].get_text() or pagination[1].text.
Code:
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get_text())
    else:
        return 1
OR
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].text)
    else:
        return 1
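Putting both fixes together, the calling code from the question would look roughly like this (a sketch only; it reuses get_html and URL from the question and still assumes the page exposes at least two a.page links, otherwise pagination[1] raises an IndexError):

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        # call the function by the name it was actually defined with
        pages_count = get_pages(html.text)
        print(pages_count)
    else:
        print('Error')

parse()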
