Write cleaned BS4 data to csv file - python

from selenium import webdriver
from bs4 import BeautifulSoup
import csv
# Drive Chrome to yell.com, search for plumbers in London, and dump the
# business-name spans to a CSV file.
chrome_path = r"C:\Users\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get('http://www.yell.com')
# Fill in the keyword and location fields of the search form.
search = driver.find_element_by_id("search_keyword")
search.send_keys("plumbers")
place = driver.find_element_by_id("search_location")
place.send_keys("London")
# Click the form's submit button.
driver.find_element_by_xpath("""//*[@id="searchBoxForm"]/fieldset/div[1]/div[3]/button""").click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Printing .text shows only the business names.
for names in soup.find_all("span", {"class": "businessCapsule--name"}):
    print(names.text)
# NOTE: Output holds the span Tag objects, not their text, so the markup
# ends up in the CSV as well -- this is the behaviour the question is about.
Output = soup.find_all("span", {"class": "businessCapsule--name"})
with open('comple16.csv', 'w') as csv_file:
    # The newline delimiter puts every item of the single row on its own line.
    csv.register_dialect('custom', delimiter='\n', quoting=csv.QUOTE_NONE, escapechar='\\')
    writer = csv.writer(csv_file, 'custom')
    row = Output
    writer.writerow(row)
Currently the code writes the tag markup into the CSV file along with the text, e.g. class="businessCapsule--name" followed by the (scraped text).
I would like to only print the scraped text into the CSV file (without the tags)
Please help.

from selenium import webdriver
from bs4 import BeautifulSoup
import csv

# Drive Chrome to yell.com, search for plumbers in London, and write only the
# text of each business-name span to a CSV file.
chrome_path = r"C:\Users\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get('http://www.yell.com')
# Fill in the keyword and location fields of the search form.
search = driver.find_element_by_id("search_keyword")
search.send_keys("plumbers")
place = driver.find_element_by_id("search_location")
place.send_keys("London")
# Click the form's submit button.
driver.find_element_by_xpath("""//*[@id="searchBoxForm"]/fieldset/div[1]/div[3]/button""").click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Keep the text of each span rather than the Tag object itself, so no markup
# reaches the CSV.
Output = [
    names.text
    for names in soup.find_all("span", {"class": "businessCapsule--name"})
]
with open('comple16.csv', 'w') as csv_file:
    # The newline delimiter puts every item of the single row on its own line.
    csv.register_dialect('custom', delimiter='\n', quoting=csv.QUOTE_NONE, escapechar='\\')
    writer = csv.writer(csv_file, 'custom')
    row = Output
    writer.writerow(row)

After:
Output = soup.find_all("span", {"class": "businessCapsule--name"})
add:
Output = [row.text for row in Output]
in order to extract text from SPAN fields.

Related

Is it possible to scrape an attribute inside a span?

So I want to scrape some phone numbers from a site. The only problem is that they are hidden behind a click. I can't go and click all of them to make them scrape-able so I wanted to ask if there is any way to get them from the 'data-phone' attribute inside the span tag.
I tried to use data_='data-phone' but that doesn't work.
from bs4 import BeautifulSoup
import requests
import csv
# Fetch the first listing page and write title/location pairs to a CSV file.
source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
# NOTE: the header is a single string, so it is written as one cell.
csv_writer.writerow(['title, location'])
# NOTE: this is the attempt the question is about; the keyword `data_` is
# treated as an attribute name, not as a lookup of the data-phone attribute.
for number in soup.find_all('span', data_='data-phone'):
    print(number)
for info in soup.find_all('div', class_='company-info-top'):
    title = info.a.text
    location = info.p.text
    csv_writer.writerow([title, location])
csv_file.close()
change
for number in soup.find_all('span', data_='data-phone'):
    print(number)
to
# Select the spans by class, then read the phone number from their
# data-phone attribute.
for number in soup.find_all('span', class_='phone'):
    print(number['data-phone'])
Output:
0334226800
0878739737
0334558584
0334798200
0334720311
0334677050
0334554948
0334535384
0337767840
0334560292
0626214363
0334559065
0334506506
0620423525
0334556166
0332012581
0334557485
0334946111
0334536200
0334545111
0334545430
0337851805
033-4721544
06-26662490
To incorporate that into your csv:
from bs4 import BeautifulSoup
import requests
import csv
# Walk listing pages 1..21 and write one CSV row per company:
# name, address, and the phone number stored in the span's data-phone attribute.
with open('C:/cms_scrape.csv','w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress', 'phone'])
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            # Fall back to 'N/A' when the company has no phone span or the
            # span lacks the attribute. An explicit check replaces the bare
            # `except:`, which would also have hidden unrelated errors.
            phone_span = search.find('span', {'class':'phone'})
            if phone_span is None:
                phone = 'N/A'
            else:
                phone = phone_span.get('data-phone', 'N/A')
            print(title)
            csv_writter.writerow([title,adress,phone])

How can I scrape multiple pages using Beautiful Soup?

How can I scrape multiple pages from a website? This code is only working for the first one:
import csv
import requests
from bs4 import BeautifulSoup
import datetime
# Scrape description and price of every thumbnail on page 1 into a
# timestamped CSV file. Only the first page is requested, which is what the
# question is about.
filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")+".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere","Pret","Data"])
    r = requests.get("https://azetshop.ro/12-extensa?page=1")
    soup = BeautifulSoup(r.text, "html.parser")
    x = soup.find_all("div", "thumbnail")
    for thumbnail in x:
        descriere = thumbnail.find("h3").text.strip()
        pret = thumbnail.find("price").text.strip()
        writer.writerow([descriere, pret, datetime.datetime.now()])
To scrape multiple pages with BeautifulSoup, a common approach is to use a while loop:
import csv
import requests
from bs4 import BeautifulSoup
import datetime
# Scrape description and price from pages 1..end_page_num into a timestamped
# CSV file.
end_page_num = 50
filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")+".csv"
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line per row.
with open(filename, "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere","Pret","Data"])
    i = 1
    while i <= end_page_num:
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(i))
        soup = BeautifulSoup(r.text, "html5lib")
        x = soup.find_all("div", {'class': 'thumbnail-container'})
        for thumbnail in x:
            descriere = thumbnail.find('h1', {"class": "h3 product-title"}).text.strip()
            pret = thumbnail.find('span', {"class": "price"}).text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
        # Move on to the next page once every thumbnail of this one is written.
        i += 1
Here i is incremented by 1 each time a page has been scraped.
Scraping continues until the end_page_num you have defined is reached.
This code also works; it uses the class_ keyword argument of bs4:
import csv
import requests
from bs4 import BeautifulSoup
import datetime
# Scrape description and price from pages 1..49 into a timestamped CSV file,
# selecting elements with the class_ keyword argument.
filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")+".csv"
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line per row.
with open(filename, "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere","Pret","Data"])
    for i in range(1,50):
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(i))
        soup = BeautifulSoup(r.text, "html.parser")
        array_price = soup.find_all('span', class_='price')
        array_desc = soup.find_all('h1', class_='h3 product-title',text=True)
        # Pair titles and prices positionally; zip stops at the shorter list
        # instead of raising IndexError when the counts differ.
        for desc_tag, price_tag in zip(array_desc, array_price):
            descriere = desc_tag.text.strip()
            pret = price_tag.text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])

Python 3.6: csvwriter only writes the two first strings

I am trying to webscrape soccer team-names and odds from a webpage to a csv file. My problem is that it's only writing the first two strings into a csv-file. Can anyone see what I'm doing wrong?
When I print it it works perfectly as seen on the picture but when I extract it to a csv file it's empty
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
# Load the live football page in Chrome and collect team names and odds per
# event container.
url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})
for container in containers:
    teams = [x.get_text() for x in container.findAll(
        "span", {"class": "c-events__team"}
    )]
    odds = [x.attrs.get('data-coef') for x in container.findAll(
        "a", {"class": "c-bets__bet"}
    )]
    #print(teams)
    #print(odds)
    #print()
# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"
# writing to csv file
# NOTE: this runs after the loop, so only the values of `teams` and `odds`
# from the last container are written -- the behaviour the question is about.
with open(filename, 'w') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(teams)
    csvwriter.writerow(odds)
It seems that you want something like this:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
# Load the live football page in Chrome, collect a row of team names and a row
# of odds for every event container, and write all rows to a CSV file.
url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})
# Accumulate rows across all containers instead of overwriting per iteration.
teams_odds = []
for container in containers:
    teams_odds.append([x.get_text() for x in container.findAll(
        "span", {"class": "c-events__team"}
    )])
    teams_odds.append([x.attrs.get('data-coef') for x in container.findAll(
        "a", {"class": "c-bets__bet"}
    )])
# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"
# writing to csv file
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line per row.
with open(filename, 'w', newline='') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing one row per collected list
    csvwriter.writerows(teams_odds)

Writing to scraped links to a CSV file using Python3

I have scraped a website for html links and have a result of about 500 links. When I try to write them to a csv file, I do not get the list, only the base page.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
# Fetch the page, print every anchor's href, then write links to a CSV file.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
# NOTE: written once after the loop, so only the last href reaches the file
# -- the behaviour the question is about.
writer.writerow([web_links])
csvfile.close()
I only get two lines in my csv file: the header 'Links' and www.census.gov. I have tried changing it by adding another for loop in the csv writer area, but I get similar results.
# Second attempt from the question: de-duplicate links through link_set while
# writing them out.
# NOTE(review): `join` is not defined in the snippets shown, and csv writer
# objects expose writerow/writerows rather than write.
for link in soup.find_all('a'):
    web_links = link.get('href')
    abs_url = join(page, web_links)
    print(abs_url)
    if abs_url and abs_url not in link_set:
        writer.write(str(abs_url) + "\n")
        link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the csv file, but no dice. Where am I making my mistake?
In your code, you are writing only two rows to the csv, i.e.
writer.writerow(['Links'])
writer.writerow([web_links])
Here web_links is the last instance of retrieved href value.
I don't see the need for the set instance. You can print and write to the csv without it in the following way:
# Fetch the page and write every non-empty href to the CSV as it is found.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
# The with-block closes the file even if writing a row raises.
with open('code_python.csv', 'w+', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    for link in soup.find_all('a'):
        web_links = link.get("href")
        # Anchors without an href yield None; skip those.
        if web_links:
            print(web_links)
            writer.writerow([web_links])
You have never added the scraped links to your set():
import requests
from bs4 import BeautifulSoup
import csv
# Fetch the page, collect the distinct hrefs in a set, then write one CSV row
# per link.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)
    # Anchors without an href yield None; keep those out of the set so no
    # empty row is written.
    if web_links:
        link_set.add(web_links)
# The with-block closes the file even if writing a row raises.
with open('code_python.csv', 'w+', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Links'])
    for link in link_set:
        writer.writerow([link])

Trouble dealing with header in a csv file

I've written some code using python to scrape some titles and price from a webpage and write the results in a csv file. The script is running awesome. As I'm appending data to a csv file the script is writing headers in such a way that if it runs 4 loops then the headers will be written 4 times. How to fix it so that the headers will be written once. Thanks.
This is the script:
import csv
import requests
from bs4 import BeautifulSoup
# Scrape title and price from each quote page and append them to item.csv.
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
for link in diction_page:
    res = requests.get(link).text
    soup = BeautifulSoup(res,'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title,price)
    with open('item.csv','a',newline='') as outfile:
        writer = csv.writer(outfile)
        # NOTE: the header is written on every iteration -- the behaviour the
        # question is about.
        writer.writerow(["Title","Price"])
        writer.writerow([title, price])
As an option you can try this:
import csv
import requests
from bs4 import BeautifulSoup
# Scrape title and price from each quote page and append them to item.csv,
# writing the header only on the first iteration.
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
for i,link in enumerate(diction_page):
    res = requests.get(link).text
    soup = BeautifulSoup(res,'lxml')
    title = soup.select_one('.name').text.strip()
    price = soup.select_one('.price').text
    print(title,price)
    with open('item.csv','a',newline='') as outfile:
        writer = csv.writer(outfile)
        # Index 0 marks the first link, so the header goes out once per run.
        if i == 0:
            writer.writerow(["Title","Price"])
        writer.writerow([title, price])
Don't write the headers in the for loop:
import csv
import requests
from bs4 import BeautifulSoup
# Scrape title and price from each quote page into item.csv; the header is
# written once, before the loop.
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
# The with-block closes the file even if a request or selector lookup raises
# part-way through the loop.
with open('item.csv','w',newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Title","Price"])
    for link in diction_page:
        res = requests.get(link).text
        soup = BeautifulSoup(res,'lxml')
        title = soup.select_one('.name').text.strip()
        price = soup.select_one('.price').text
        print(title,price)
        writer.writerow([title, price])

Categories

Resources