Parsing text after 'href' - python

I need to get the text "Платонов А.П." Here's my code so far.
import requests
from bs4 import BeautifulSoup

URL = "https://www.moscowbooks.ru/books/?sortby=name&sortdown=false"
HEADERS = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", "accept": "*/*"}
HOST = "https://www.moscowbooks.ru"

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_="catalog__item")
    books = []
    for item in items:
        author_check = item.find("a", class_="author-name")
        if author_check:
            author = author_check.get_text()
        else:
            author_check = "Автор не указан"
        books.append({
            "title": item.find("div", class_="book-preview__title").get_text(strip=True),
            "author": author_check,
            "link": HOST + item.find("a", class_="book-preview__title-link").get("href"),
            "cost": item.find("div", class_="book-preview__price").get_text(strip=True),
        })
    print(books)
    print(len(books))

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print("Error")

parse()
I get problems with the author, because it comes out like this:
<a class="author-name" href="/catalog/author/21381/">Платонов А. П. </a>
I also need a little help with the price, because sometimes it returns '2\xa0274' instead of '2 274'.

The problem is that you put "author": author_check in your dictionary, while author_check = item.find("a", class_="author-name") is the tag itself and author = author_check.get_text() holds the text. You can change your for loop to something like this:
for item in items:
    author_check = item.find("a", class_="author-name")
    if author_check:
        author = author_check.text
    else:
        author = "Автор не указан"
For your issue with the display of the prices, you can just replace \xa0 with a comma or a space:
"cost": item.find("div", class_="book-preview__price").get_text(strip=True).replace(u"\xa0", ",")

I've had to deal with a similar problem. You can do the following (note the split runs on the raw tag string, since get_text() has already stripped the angle brackets):
author = str(author_check).split('>')[-2].split('<')[0]
You might have to substitute -2 with -1.
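A simpler alternative, assuming the only leftover problem is the surrounding whitespace:
author = author_check.get_text(strip=True)
This returns the author's name without the tags and without the trailing space.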

Multiple values against the same tag not scraping

I'm getting no values for my "Number of Rooms" and "Room" search.
https://www.zoopla.co.uk/property/uprn/906032139/
I can see here that I should be returning something but not getting anything.
Can anyone possibly point me in the right direction? I'm not even sure what to search for, as it's not erroring. I thought it would pull in all the data and that I would then need to figure out a way to separate it. Do I need to scrape it into a dictionary?
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
addresses = []
while page != 2:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    print(url)
    response = requests.get(url, headers=headers)
    print(response)
    html = response.content
    soup = bs(html, "lxml")
    time.sleep(1)
    for address in soup.find_all("div", class_="c-rgUPM c-rgUPM-pnwXf-hasUprn-true"):
        details = {}
        # Getting the address
        details["Address"] = address.h2.get_text(strip=True)
        # Getting each address's unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        details["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, "lxml")
        # Lists status of the property
        try:
            details["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all("span", class_="css-10o3xac-Tag e164ranr11")]
        except AttributeError:
            details["Status"] = ""
        # Lists the date of the status of the property
        try:
            details["Status Date"] = [status_date.get_text(strip=True) for status_date in scotland_house_soup.find_all("p", class_="css-1jq4rzj e164ranr10")]
        except AttributeError:
            details["Status Date"] = ""
        # Lists the value of the property
        try:
            details["Value"] = [value.get_text(strip=True).replace(",", "").replace("£", "") for value in scotland_house_soup.find_all("p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            details["Value"] = ""
        # Lists the number of rooms
        try:
            details["Number of Rooms"] = [number_of_rooms.get_text(strip=True) for number_of_rooms in scotland_house_soup.find_all("p", class_="css-82kmy1 e13gx5i3")]
        except AttributeError:
            details["Number of Rooms"] = ""
        # Lists type of room
        try:
            details["Room"] = [room.get_text(strip=True) for room in scotland_house_soup.find_all("span", class_="css-1avcdf2 e13gx5i4")]
        except AttributeError:
            details["Room"] = ""
        addresses.append(details)
    page = page + 1
for address in addresses[:]:
    print(address)
print(response)
Selecting by class_="css-1avcdf2 e13gx5i4" seems brittle; auto-generated class names like these can change at any time. Try a different CSS selector:
import requests
from bs4 import BeautifulSoup
url = "https://www.zoopla.co.uk/property/uprn/906032139/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
tag = soup.select_one('#timeline p:has(svg[data-testid="bed"]) + p')
no_beds, beds = tag.get_text(strip=True, separator=" ").split()
print(no_beds, beds)
Prints:
1 bed
If you want all types of rooms:
for detail in soup.select("#timeline p:has(svg[data-testid]) + p"):
    n, type_ = detail.get_text(strip=True, separator="|").split("|")
    print(n, type_)
Prints:
1 bed
1 bath
1 reception
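Since you asked about a dictionary: you can collect these pairs into your details dict in the same pass (a sketch using the selector above; the "Rooms" key is my own naming, not Zoopla's):

rooms = {}
for detail in soup.select("#timeline p:has(svg[data-testid]) + p"):
    n, type_ = detail.get_text(strip=True, separator="|").split("|")
    rooms[type_] = int(n)  # e.g. {"bed": 1, "bath": 1, "reception": 1}
details["Rooms"] = rooms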

How do I write data from python line by line to csv file?

I'm just learning Python and want to improve through examples. Sorry for my English, I'm in the process of learning a new language. :)
The program pulls data from an e-commerce site.
When I want to save the data as a CSV file, each new row overwrites the previous one. I tried several examples, but it didn't work.
Thanks for your help.
import requests
import gettext
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as xls
import xlsxwriter

baseurl = "https://www.trendyol.com"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41'}

for x in range(1, 62):
    r = requests.get(f'https://www.trendyol.com/cep-telefonu-x-c103498?pi={x}', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('div', class_='p-card-wrppr')
    for item in productlist:
        productname = item.find('span', class_='prdct-desc-cntnr-name').getText()
        productprice_old = item.find('div', class_='prc-box-sllng').getText()
        productprice_discount = item.find('div', class_='prc-box-dscntd')
        for productlink in item.find_all('a'):
            productlink = baseurl + productlink.get('href')
        if productprice_discount == None:
            productprice_discount = productprice_old
        else:
            productprice_discount = productprice_discount.getText()
        for merchant_name in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            merchant_name = soup.find('a', class_='merchant-text')
            if merchant_name == None:
                merchant_name = soup.find('a', class_='title')
            if merchant_name == None:
                merchant_name = soup.find('span', class_='product-description-market-place')
            if merchant_name == None:
                merchant_name = 'NULL'
            else:
                merchant_name = merchant_name.getText()
            break
        for product_image in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            product_image = soup.find_all('img', attrs={'class': 'detail-section-img'})
            image_src = [x['src'] for x in product_image]
            image_src = [x for x in image_src if x.endswith(('.jpg', '.png'))]
            break
        data = [[productname, productlink, productprice_old, productprice_discount, merchant_name, image_src]]
        df = pd.DataFrame(data, columns=["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])
        df.to_csv('trendyol3.csv')
You should add mode='a', which means append, so each write appends to the file instead of rewriting it:
df.to_csv('trendyol3.csv', mode='a')
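Note that with mode='a' the header row is written on every call, so it will repeat throughout the file. A common pattern (a sketch, assuming you want a single header row in the finished file) is to write the header only when the file does not exist yet:

import os
write_header = not os.path.exists('trendyol3.csv')  # header only on the first write
df.to_csv('trendyol3.csv', mode='a', header=write_header, index=False)

index=False also keeps the DataFrame's row index out of the CSV.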

python/ beautifulsoup KeyError: 'href'

I am using bs4 to write a webscraper to obtain funding news data.
The first part of my code extracts the title, link, summary and date
of each article for n number of pages.
The second part of my code loops through the link column and inputs
the resulting url in a new function, which extracts the url of the
company in question.
For the most part, the code works fine (40 pages scraped without errors). I am trying to stress test it by raising it to 80 pages, but I'm running into KeyError: 'href' and I don't know how to fix it.
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
def clean_data(column):
    df[column] = df[column].str.encode('ascii', 'ignore').str.decode('ascii')

#extract
def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    url = f'https://www.uktechnews.info/category/investment-round/series-a/page/{page}/'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

#transform
def transform(soup):
    for item in soup.find_all('div', class_='post-block-style'):
        title = item.find('h3', {'class': 'post-title'}).text.replace('\n', '')
        link = item.find('a')['href']
        summary = item.find('p').text
        date = item.find('span', {'class': 'post-meta-date'}).text.replace('\n', '')
        news = {
            'title': title,
            'link': link,
            'summary': summary,
            'date': date
        }
        newslist.append(news)
    return

newslist = []

#subpage
def extract_subpage(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    r = requests.get(url, headers)
    soup_subpage = BeautifulSoup(r.text, 'html.parser')
    return soup_subpage

def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data):
        subpage_link = {
            'subpage_link': main_data[0]['href']
        }
        subpage.append(subpage_link)
    else:
        subpage_link = {
            'subpage_link': '--'
        }
        subpage.append(subpage_link)
    return

subpage = []

#load
page = np.arange(0, 80, 1).tolist()
for page in tqdm(page):
    try:
        c = extract(page)
        transform(c)
    except:
        None

df1 = pd.DataFrame(newslist)

for url in tqdm(df1['link']):
    t = extract_subpage(url)
    transform_subpage(t)

df2 = pd.DataFrame(subpage)
[Screenshot of the traceback: KeyError: 'href' raised from transform_subpage]
I think the issue is that my if statement for the transform_subpage function does not account for instances where main_data is not an empty list but does not contain href links. I am relatively new to Python so any help would be much appreciated!
You are correct: it's caused by main_data[0] not having an 'href' attribute at some point. You can change the logic to check for the attribute and fall back to '--' whenever it is missing, so every page still appends exactly one entry:
def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data) and 'href' in main_data[0].attrs:
        subpage_link = {
            'subpage_link': main_data[0]['href']
        }
    else:
        subpage_link = {
            'subpage_link': '--'
        }
    subpage.append(subpage_link)
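A more compact alternative (a sketch of the same logic): Tag.get returns a default instead of raising when an attribute is missing, so the whole check collapses to one line:

def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    # .get('href', '--') never raises KeyError, unlike main_data[0]['href']
    link = main_data[0].get('href', '--') if main_data else '--'
    subpage.append({'subpage_link': link})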
Also, just a note: it's probably not a great idea to iterate through a list while reusing the list's own variable name for each item. Change it to something like:
page_list = np.arange(0, 80, 1).tolist()
for page in tqdm(page_list):

how to scrape nested tag elements with python

Hi, I would like to get some data that sits inside the <del> or <ins> tags below, but I could not find any solution for it. Does anyone have an idea about this kind of scraping, and is there a short way to get that information?
This is my Python code:
import requests
import json
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
base_url = "https://www.n11.com/super-firsatlar"
r = requests.get(base_url, headers=header)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    books = soup.find_all('li', attrs={"class": "column"})
    result = []
    for book in books:
        title = book.find('h3').text
        link = base_url + book.find('a')['href']
        picture = base_url + book.find('img')['src']
        price = soup.find('a', attrs={"class": "ins"})
        single = {'title': title, 'link': link, 'picture': picture, 'price': price}
        result.append(single)
    with open('book.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
else:
    print(r.status_code)
<div class="proDetail">
    <a href="https://test.com" class="oldPrice" title="Premium">
        <del>69,00 TL</del></a>
    <a href="https://test.com" class="newPrice" title="Premium">
        <ins>14,90</ins>
    </a>
</div>
and this is my output
{
    "title": "Premium",
    "link": "https://test.com",
    "picture": "https://pic.gif",
    "price": null
},
You are searching for the wrong class, and on the whole soup instead of the current book. First search for the class 'newPrice' to get the <a> block:
price = book.find('a', attrs={'class': 'newPrice'})
Then you can search inside this a-block for the ins-block like:
price = book.find('a', attrs={'class': 'newPrice'}).find('ins')
Then your result would look like:
<ins>14,90</ins>
For final result strip the html tags:
price = book.find('a', attrs={'class': 'newPrice'}).find('ins').text.strip()
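One caveat: if a listing has no discounted price at all, book.find('a', attrs={'class': 'newPrice'}) returns None and the chained .find('ins') raises AttributeError. A guarded version (a sketch, assuming such listings can occur):

price_tag = book.find('a', attrs={'class': 'newPrice'})
price = price_tag.find('ins').text.strip() if price_tag else None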

name 'get_pages_count' is not defined

I am writing a parser, but I have a problem. I understand that you can find many similar questions on the Internet, but they did not help me, so I am asking for your help. I have little experience, so this question may not be very well put.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru/topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get.text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="lent-brief")
    games = []
    for item in items:
        games.append({
            "title": item.find("div", class_="title lent-title").get_text(strip=True),
            "date": item.find("div", class_="game-date").get_text(strip=True),
            "ganre": item.find("div", class_="game-genre").get_text(strip=True),
        })
    print(games)
    print(len(games))
    return games

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages_count(html.text)
        print(pages_count)
    else:
        print('Error')

parse()
Error:
File "D:/Python/parser1.py", line 45, in parse
pages_count = get_pages_count(html.text)
NameError: name 'get_pages_count' is not defined
Your function is named get_pages, but you're calling get_pages_count:
def get_pages(html):
.. but when attempting to call it:
pages_count = get_pages_count(html.text)
.. the call should be:
pages_count = get_pages(html.text)
In the get_pages function, the method you call is also wrong. Instead of pagination[1].get.text() it should be pagination[1].get_text() or pagination[1].text.
Code:
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get_text())
    else:
        return 1
OR
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].text)
    else:
        return 1
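With both fixes applied, parse can use the page count to walk the remaining pages. A sketch, assuming the site paginates via a page query parameter (the parameter name is an assumption; check the real pagination links):

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages(html.text)  # corrected name and method
        for page in range(1, pages_count + 1):
            html = get_html(URL, params={'page': page})  # 'page' param is hypothetical
            get_content(html.text)
    else:
        print('Error')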
