I need to save values to a .csv file, but the number of values changes for each product, and I can't work out how to do it correctly so that each value is recorded under its own parameter, as shown in the file — please advise.
I will also attach a file to make it easier to understand what I need
from bs4 import BeautifulSoup
import requests
import time
# Base host used to turn relative product hrefs into absolute URLs.
HOST = 'https://samara.vseinstrumenti.ru'
# Category listing page where scraping starts.
URL = 'https://samara.vseinstrumenti.ru/santehnika/vse-dlya-vodosnabzheniya/avtonomnaya-kanalizatsiya/'
# Desktop-browser User-Agent so the site serves regular HTML markup.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
def get_html(url, params=None):
    """Fetch *url* with the module HEADERS and return the requests.Response.

    params: optional query parameters forwarded to requests.get.
    """
    # Restored indentation: the pasted snippet had its structure flattened.
    r = requests.get(url, headers=HEADERS, params=params)
    return r
def get_url(html):
    """Walk the category listing HTML and scrape every product tile.

    For each product card, builds the absolute product URL and passes it to
    data_collection(). Sleeps 5 seconds between products to be polite to
    the site.
    """
    soup = BeautifulSoup(html, 'html.parser')
    urls = soup.find_all('div', class_='product-tile grid-item')
    for item in urls:
        time.sleep(5)
        data_collection(HOST + item.find(class_='title').find('a').get('href'))
def get_name(html):
    """Return the product title text (h1.title) from a product page."""
    soup = BeautifulSoup(html, 'html.parser')
    name = soup.find('h1', class_='title').text
    return name
def get_description(html):
    """Return the product description text (div[itemprop=description])."""
    soup = BeautifulSoup(html, 'html.parser')
    description = soup.find('div', itemprop="description").text
    return description
def get_specifications_parameter(html):
    """Return the specification *names* from the dotted list.

    Result is a list of bs4 tags (span.text elements), one per parameter;
    its length varies per product.
    """
    soup = BeautifulSoup(html, 'html.parser')
    dotted_list = soup.find('ul', class_='dotted-list')
    parameters = dotted_list.find_all('span', class_='text')
    return parameters
def get_specifications_meaning(html):
    """Return the specification *values* (span.value) from the dotted list.

    Pairs positionally with get_specifications_parameter() output.
    """
    soup = BeautifulSoup(html, 'html.parser')
    dotted_list = soup.find('ul', class_='dotted-list')
    meaning = dotted_list.find_all('span', class_='value')
    return meaning
def get_photo(html):
    """Return the src URL of the active gallery image on a product page."""
    soup = BeautifulSoup(html, 'html.parser')
    photo = soup.find('div', class_="item -active").find('img').get('src')
    return photo
def get_price(html):
    """Return the current price text (span.current-price) from a product page."""
    soup = BeautifulSoup(html, 'html.parser')
    price = soup.find('span', class_='current-price').text
    return price
def data_collection(URL):
    """Scrape one product page and return its fields as a dict.

    Fix: the original bound every value to a local and then discarded it,
    so nothing could ever be saved. Returning a dict lets a caller write
    the row to CSV; existing callers that ignore the return are unaffected.
    """
    html = get_html(URL)
    name = get_name(html.text)
    description = get_description(html.text)
    specifications_parameter = get_specifications_parameter(html.text)
    meaning = get_specifications_meaning(html.text)
    # photo = get_photo(html.text)  # kept disabled, as in the original
    price = get_price(html.text)
    return {
        'name': name,
        'description': description,
        'parameters': specifications_parameter,
        'meaning': meaning,
        'price': price,
    }
def start():
    """Entry point: fetch the listing page and scrape it on HTTP 200."""
    html = get_html(URL)
    if html.status_code == 200:
        get_url(html.text)
    else:
        print('Network error')


start()
I tried to do this, but it doesn't work like this
def save_file_walid(items, path):
    """Write *items* (an iterable of rows) to a ';'-delimited CSV at *path*.

    Each element of *items* must itself be an iterable of cell values.
    NOTE: opened in 'w' mode, so the file is overwritten on every call.
    """
    import csv  # fix: the original snippet used csv without importing it
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=';')
        for item in items:
            writer.writerow(item)
https://drive.google.com/file/d/1uGoW1kpsDGDA-Zh7SiiCDcg9cf2lHQUd/view?usp=sharing
I'd like to know more about what's really happening.
First of all, the source path is correctly named? I mean, for example, a correct path name would be:
"/root/source path/content.csv"
with the file's name inside the path.
Looking at your code, at the end of the data_collection function you can add:
data = [name, description, specifications_parameter, meaning, price]
save_file_walid(data, path)
To store the data into the csv file. Then, in your save_file_walid() you don't need to use the for loop if you write only the data list. You just need:
def save_file_walid(item, path):
    """Write a single row *item* to a ';'-delimited CSV at *path*.

    NOTE: 'w' mode truncates the file, so each call replaces the previous
    contents; use mode='a' if rows must accumulate across calls.
    """
    import csv  # fix: csv was used but never imported in the snippet
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(item)
Finally, before storing the data you can add only once somewhere in your code:
data = ["name", "description", "specifications_parameter", "meaning", "price"]
save_file_walid(data, path)
to create the file (if there's not been created yet) with the name of each column.
Hope this is helpful to you :)
Related
In this URL https://doc8643.com/aircrafts I want to scrape all rows.
Then for each individual row, for example https://doc8643.com/aircraft/A139
I want to scrape these three areas of data
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this be done in Python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://doc8643.com/aircrafts'
# NOTE(review): this Request object is built but never used — the
# requests.get() call below sends no User-Agent header.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        # One list item per aircraft type; the h3 holds the designator text.
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            writer.writerow([c.text if c.text else '' for c in row.select('h3')])
            print(row)
        # If more than one page then iterate through all of them.
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
        else:
            break
You should create a function that takes the value of c.text (i.e., A139), builds the full URL like https://doc8643.com/aircraft/A139, and uses requests and BeautifulSoup to get all the needed data.
def scrape_details(number):
    """Fetch the detail page for aircraft *number* and return scraped rows.

    Sketch only: the placeholder comment marks where the page-specific
    parsing goes. Fix: `results` is now initialised, so the stub returns an
    empty list instead of raising NameError. Relies on a module-level
    `headers` dict.
    """
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    # ... scrape details and put in list `results` ...
    return results
and run it in your loop
# Run scrape_details() for every designator found on the listing page and
# write one CSV row per aircraft: [designator, detail, detail, ...].
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
    data = [c.text if c.text else '' for c in row.select('h3')]
    for item in data:
        values = scrape_details(item)
        writer.writerow([item] + values)
The biggest problem is to scrape details.
For some details it needs to scrape dl and next all dt and dd and use zip() to group in pairs.
Something like
def scrape_details(number):
    """Scrape the dt/dd specification pairs from one aircraft detail page.

    Returns a list of "name: value" strings, one per dt/dd pair. Relies on
    a module-level `headers` dict for the User-Agent.
    """
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []
    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        # zip() pairs each specification name (dt) with its value (dd).
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)
    #print(results)
    return results
but this needs more code — I skip that part here.
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup
# Desktop Chrome User-Agent sent with every request in this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# --- functions ---
def scrape_details(number):
    """Return the specifications from one aircraft detail page.

    Each <dl> on the page holds alternating <dt> (name) / <dd> (value)
    entries; they are zipped into "name: value" strings.
    """
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []
    for dl in soup.find_all('dl'):
        names = dl.find_all('dt')
        values = dl.find_all('dd')
        for dt, dd in zip(names, values):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)
    #print(results)
    return results
# --- main ---
url = 'https://doc8643.com/aircrafts'

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["data1", "data2", "data3", "etc..."])

    while True:
        print('url:', url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # One list item per aircraft type on the current listing page.
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            data = [c.text if c.text else '' for c in row.select('h3')]
            for item in data:
                values = scrape_details(item)
                writer.writerow([item] + values)

        # Follow the "next page" link if present; its href is site-relative,
        # so the host must be prepended.
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
            url = 'https://doc8643.com' + url
        else:
            break
BTW:
Maybe it would be better to keep results as dictionary
results[dt.string] = [dd.string]
I'm just learning python. I want to improve myself with examples. sorry for my English. I'm in the process of learning a new language. :)
The program pulls data from an e-commerce site.
when I want to save it as a csv file, each new data overwrites the previous data. I tried several examples but it didn't work.
Thanks for your help.
import requests
import gettext
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as xls
import xlsxwriter  # fix: a stray backtick after this import was a SyntaxError

# Host used to build absolute product links from relative hrefs.
baseurl = "https://www.trendyol.com"
headers = {
    'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41'
}
all_rows = []  # accumulate rows across all pages; write the CSV once at the end

for x in range(1, 62):
    r = requests.get(f'https://www.trendyol.com/cep-telefonu-x-c103498?pi={x}', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('div', class_='p-card-wrppr')

    for item in productlist:
        productname = item.find('span', class_='prdct-desc-cntnr-name').getText()
        productprice_old = item.find('div', class_='prc-box-sllng').getText()
        productprice_discount = item.find('div', class_='prc-box-dscntd')

        # Keep the last anchor's absolute URL, as the original loop did.
        for productlink in item.find_all('a'):
            productlink = baseurl + productlink.get('href')

        if productprice_discount is None:
            # No discount box on the card: fall back to the regular price.
            productprice_discount = productprice_old
        else:
            productprice_discount = productprice_discount.getText()

        # Fetch the product page once and reuse the soup for both the
        # merchant name and the images (the original downloaded it twice,
        # inside loops that broke after the first character of the URL).
        r = requests.get(productlink, headers=headers)
        detail_soup = BeautifulSoup(r.content, 'lxml')

        merchant_name = (detail_soup.find('a', class_='merchant-text')
                         or detail_soup.find('a', class_='title')
                         or detail_soup.find('span', class_='product-description-market-place'))
        merchant_name = merchant_name.getText() if merchant_name is not None else 'NULL'

        product_image = detail_soup.find_all('img', attrs={'class': 'detail-section-img'})
        image_src = [img['src'] for img in product_image]
        # Fix: endswith('.jpg' or '.png') evaluates to endswith('.jpg'), so
        # .png images were always dropped; test both extensions.
        image_src = [s for s in image_src if s.endswith(('.jpg', '.png'))]

        all_rows.append([productname, productlink, productprice_old,
                         productprice_discount, merchant_name, image_src])

# Fix for the reported bug: the original wrote the DataFrame inside the loop,
# overwriting the file with a single row each time. Write everything once.
df = pd.DataFrame(all_rows, columns=["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])
df.to_csv('trendyol3.csv')
You should add mode='a' (append) so new rows are appended to the file instead of rewriting it:
df.to_csv('trendyol3.csv', mode='a')
from bs4 import BeautifulSoup
import requests
import csv
class Parse():
    """Scraper for TripAdvisor attraction listing pages."""

    def __init__(self):
        self.row_list = []
        self.base_url = 'https://www.tripadvisor.co.uk'

    def parse(self, url):
        """Scrape one listing page.

        Returns (next_page_url, rows), where rows is a list of
        [name, rating, status, review_count] lists, one per attraction card.
        Fix: the original rebound row_list on every iteration and returned
        only the LAST card, although write_csv() (writerows) expects a list
        of rows.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
        response = requests.get(url, headers).text
        soup = BeautifulSoup(response, 'html.parser')
        next_link = soup.find('a', class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
        next_page = self.base_url + next_link.attrs['href']
        cards = soup.find_all('section', class_='_2TabEHya _3YhIe-Un')
        rows = []
        for card in cards:
            name = card.find('div', class_='_1gpq3zsA _1zP41Z7X').text
            rating = str(card.find('svg', class_='zWXXYhVR'))
            rating = self.remove(filter_col=rating)
            review_count = card.find('span', class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
            status = card.find('div', class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
            rows.append([name, rating, status, review_count])
        return next_page, rows

    def remove(self, filter_col):
        """Take the second whitespace token of *filter_col* and return its
        last three characters (extracts the rating from the svg markup)."""
        rating = filter_col.split(' ')[1]
        rating = rating[-3:]
        return rating

    def write_csv(self, row_list):
        """Write the scraped rows to top_sites.csv (overwritten each call).

        newline='' prevents blank lines between rows on Windows.
        """
        with open('top_sites.csv', 'w', newline='') as file:
            csv_writer = csv.writer(file, delimiter=',')
            csv_writer.writerows(row_list)
if __name__ == '__main__':
    # Scrape one listing page and show where the pagination points next.
    url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
    parsing = Parse()
    next_url, row_list = parsing.parse(url=url)
    print(next_url)
PS C:\Users\Caspe\PycharmProjects\Selenium Test> & "c:/Users/Caspe/PycharmProjects/Selenium Test/.venv/Scripts/python.exe" "c:/Users/Caspe/PycharmProjects/Selenium Test/Demo/tripadvisor_topattract.py"
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
PS C:\Users\Caspe\PycharmProjects\Selenium Test>
I'm trying to scrape data from TripAdvisor Website using BeautifulSoup.
Link: https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html
Instead of going to next page, the link is repeated itself. Is there a solution for my problem?
I've selected the correct selector for the soup and I was able to scrape data.
To get pagination working, it's necessary to change the -oa<index>- part in URL:
import csv
import requests
from bs4 import BeautifulSoup

# Pagination is driven by the -oa<offset>- URL segment, 30 items per page.
url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"

data = []
for page in range(0, 4):  # <--- increase page count here
    print("Getting page {}..".format(page))
    soup = BeautifulSoup(
        requests.get(url.format(page * 30)).content, "html.parser"
    )
    titles = soup.select('span[name="title"]')
    for title in titles:
        # Title text looks like "<no>|<name>" after separator splitting.
        no, t = title.get_text(strip=True, separator="|").split("|")
        rating = title.find_next("svg")
        review_count = rating.find_next("span")
        data.append(
            (
                no,
                t,
                rating["title"],
                review_count.text,
                review_count.find_next(
                    "div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
                ).text,
            )
        )

# newline='' avoids blank lines between rows on Windows.
with open("data.csv", "w", newline="") as f_out:
    w = csv.writer(f_out)
    w.writerows(data)
Writes data.csv (screenshot from LibreOffice):
I am trying to parse data from all pages. Parsing ends after the first page. What could be the problem?
I use pagination with the use of a regular expression.
The first page of the site and others differ in the html code, so I have to create two different functions main_1 and main_2 for the first and other pages.
If you try to run only the main_2 function, nothing will work. .CSV file will not be created.
help me please.
import requests
from bs4 import BeautifulSoup
import csv
import re
def get_html(url):
    """Return the body of *url*, or None (after printing the status) on failure."""
    r = requests.get(url)
    if r.ok:
        return r.text
    print(r.status_code)
def writer_csv(data):
    """Append one row (name, url, price) from dict *data* to tesr.csv.

    Opened in 'a' mode so rows accumulate across calls; newline='' prevents
    blank lines between rows on Windows.
    """
    with open('tesr.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))
def get_data_page(html):
    """Parse a coinmarketcap listing page and write one CSV row per coin.

    Each missing field falls back to '' so a single malformed row doesn't
    abort the whole page. Bare excepts narrowed to the lookup errors the
    chained find/index calls can actually raise.
    """
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr', class_='cmc-table-row')
    for tr in trs:
        tds = tr.find_all('td')
        try:
            name = tds[1].find('a', class_='cmc-link').text.strip()
        except (AttributeError, IndexError):
            name = ''
        try:
            url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
        except (AttributeError, IndexError):
            url = ''
        try:
            price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
        except AttributeError:
            price = ''
        data = {'name': name,
                'url': url,
                'price': price}
        writer_csv(data)
def main_1():
    """Scrape the site's first page (its markup differs from page 2 onward)."""
    url_1 = 'https://coinmarketcap.com/'
    get_data_page(get_html(url_1))
def main_2():
    """Scrape page 2 onwards, following the 'Next' pagination link.

    Fix: download each page once and reuse the HTML for both parsing and
    locating the next link — the original called get_html() twice per page.
    Stops when no 'Next' link is found (AttributeError on the chained find).
    """
    url_2 = 'https://coinmarketcap.com/2/'
    while True:
        html = get_html(url_2)
        get_data_page(html)
        soup = BeautifulSoup(html, 'lxml')
        try:
            pattern = 'Next '
            url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
        except AttributeError:
            break


main_1()
main_2()
import requests
from bs4 import BeautifulSoup
import json
import re
url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.57446b5079XMO8"
page = requests.get(url)
print(page.status_code)
print(page.text)

soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())

# The product catalogue lives in the SECOND ld+json script block (index 1).
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)
Below is the code to Find All the relevant product information from json object
# Pull the relevant fields from each product entry in the JSON-LD item list.
for item in jsonObj['itemListElement']:
    name = item['name']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    # availability is a schema.org URL like '.../InStock': keep the last path
    # segment, then re-space the CamelCase word ('InStock' -> 'In Stock').
    availability = item['offers']['availability'].split('/')[-1]
    availability = [s for s in re.split("([A-Z][^A-Z]*)", availability) if s]
    availability = ' '.join(availability)
Here is the code to extract URL for json script
url = item['url']
# Fix: the original format string had four placeholders but five arguments,
# which raises "not all arguments converted during string formatting".
print('Availability: %s Price: %0.2f %s Name: %s Url: %s' % (availability, float(price), currency, name, url))
Below is the code to extract data inro csv:
outfile = open('products.csv', 'w', newline='')
writer = csv.writer(outfile)
# Fix: include the 'url' column in the header to match the data rows.
writer.writerow(["name", "type", "price", "priceCurrency", "availability", "url"])

alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)

for item in jsonObj['itemListElement']:
    name = item['name']
    type = item['#type']  # NOTE(review): shadows the builtin; rename if possible
    url = item['url']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
The file gets the header row, but no data is written to the CSV because of the URL variable.
# Fix: `url` (lower case) is the variable assigned above; `URL` was undefined
# and raised NameError, so no data rows were ever written.
writer.writerow([name, type, price, currency, availability, url])
outfile.close()
first, you don't include the header there. not a big deal, just the first row would have a blank for your header in the url column. So to include that:
writer.writerow(["name", "type", "price", "priceCurrency", "availability", "url" ])
Second, you store the string as url, but then reference URL in your writer. URL isn't holding any value. In fact, it should have given an error of URL is not defined or something similar.
And since you already use url in your code with url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.57446b5079XMO8", I would also probably change the variable name to something like url_text.
I'd probably also use variable type_text or something other than type, since type is a built-in function in python.
But you need to change to:
writer.writerow([name, type, price, currency, availability, url ])
outfile.close()
Full code:
import requests
from bs4 import BeautifulSoup
import json
import csv

url = "https://www.daraz.pk/catalog/?q=dell&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.57446b5079XMO8"
page = requests.get(url)
print(page.status_code)
print(page.text)

soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())

# Product data is embedded in the second ld+json script block.
alpha = soup.find_all('script', {'type': 'application/ld+json'})
jsonObj = json.loads(alpha[1].text)

# Raw string so '\p' in the Windows path cannot be mistaken for an escape.
outfile = open(r'c:\products.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["name", "type", "price", "priceCurrency", "availability", "url"])

for item in jsonObj['itemListElement']:
    name = item['name']
    type_text = item['#type']
    url_text = item['url']
    price = item['offers']['price']
    currency = item['offers']['priceCurrency']
    availability = item['offers']['availability'].split('/')[-1]
    writer.writerow([name, type_text, price, currency, availability, url_text])

outfile.close()
The only thing I could find wrong is that you have a typo in the last line - upper-case URL instead of lower-case url. Changing it made the script work perfectly.