Writing the news to CSV-file (Python 3, BeautifulSoup) - python

I want Python3.6 to write the output of the following code into a csv. It would be very nice to have it like this: one row for every article (it's a News-Website) and four columns with "Title", "URL", "Category" [#Politik, etc.], "PublishedAt".
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup((r.content), "lxml")
div = soup.find("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for a in div.find_all('a', title=True):
print(a.text, a.find_next_sibling('span').text)
print(a.get('href'))
For writing to a csv I already have this...
with open('%s_schlagzeilen.csv' % datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S.%f'), 'w', newline='',
encoding='utf-8') as file:
w = csv.writer(file, delimiter="|")
w.writerow([...])
..and need to know what's next to do. THX!! in advance!

You can collect all the desired extracted fields into a list of dictionaries and use the csv.DictWriter to write to the CSV file:
import csv
import datetime
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup((r.content), "lxml")
articles = []
for a in soup.select(".schlagzeilen-content.schlagzeilen-overview a[title]"):
category, published_at = a.find_next_sibling(class_="headline-date").get_text().split(",")
articles.append({
"Title": a.get_text(),
"URL": a.get('href'),
"Category": category.strip(" ()"),
"PublishedAt": published_at.strip(" ()")
})
filename = '%s_schlagzeilen.csv' % datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S.%f')
with open(filename, 'w', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=["Title", "URL", "Category", "PublishedAt"], )
writer.writeheader()
writer.writerows(articles)
Note how we are locating the categories and the "published at" - we need to go to the next sibling element and split the text by comma, stripping out the extra parenthesis.

Related

Scrape information from multiple URLs listed in a CSV using BeautifulSoup and then export these results to a new CSV file

I have a 45k+ rows CSV file, each one containing a different path of the same domain - which are structurally identical to each other - and every single one is clickable. I managed to use BeautifulSoup to scrape the title and content of each one and through the print function, I was able to validate the scraper. However, when I try to export the information gathered to a new CSV file, I only get the last URL's street name and description, and not all of them as I expected.
from bs4 import BeautifulSoup
import requests
import csv
with open('URLs.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
site = requests.get(row['addresses']).text
soup = BeautifulSoup(site, 'lxml')
StreetName = soup.find('div', class_='hist-title').text
Description = soup.find('div', class_='hist-content').text
with open('OutputList.csv','w', newline='') as output:
Header = ['StreetName', 'Description']
writer = csv.DictWriter(output, fieldnames=Header)
writer.writeheader()
writer.writerow({'StreetName' : StreetName, 'Description' : Description})
How can the output CSV have on each row the street name and description for the respective URL row in the input CSV file?
You need to open both files on the same level and then read and write on each iteration. Something like this:
from bs4 import BeautifulSoup
import requests
import csv
with open('URLs.csv') as a, open('OutputList.csv', 'w') as b:
reader = csv.reader(a)
writer = csv.writer(b, quoting=csv.QUOTE_ALL)
writer.writerow(['StreetName', 'Description'])
# Assuming url is the first field in the CSV
for url, *_ in reader:
r = requests.get(url)
if r.ok:
soup = BeautifulSoup(r.text, 'lxml')
street_name = soup.find('div', class_='hist-title').text.strip()
description = soup.find('div', class_='hist-content').text.strip()
writer.writerow([street_name, description])
I hope it helps.

Python 3.6: csvwriter only writes the two first strings

I am trying to webscrape soccer team-names and odds from a webpage to a csv file. My problem is that it's only writing the first two strings into a csv-file. Can anyone see what I'm doing wrong?
When I print it it works perfectly as seen on the picture but when I extract it to a csv file it's empty
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})
for container in containers:
teams = [x.get_text() for x in container.findAll(
"span", {"class": "c-events__team"}
)]
odds = [x.attrs.get('data-coef') for x in container.findAll(
"a", {"class": "c-bets__bet"}
)]
#print(teams)
#print(odds)
#print()
# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"
# writing to csv file
with open(filename, 'w') as csvfile:
# creating a csv writer object
csvwriter = csv.writer(csvfile)
# writing the fields
csvwriter.writerow(teams)
csvwriter.writerow(odds)
It seems that you want something like this:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
url = "https://1xbet.cm/en/live/Football/"
driver = webdriver.Chrome("C:/Users/Christian/Desktop/WebScraper 0.5/chromedriver/chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
containers = soup.findAll("div", {"class": "c-events__item_col"})
teams_odds = []
for container in containers:
teams_odds.append([x.get_text() for x in container.findAll(
"span", {"class": "c-events__team"}
)])
teams_odds.append([x.attrs.get('data-coef') for x in container.findAll(
"a", {"class": "c-bets__bet"}
)])
# name of csv file
filename = "C:/Users/Christian/Desktop/WebScraper 0.5/1xbetLiveOdds.csv"
# writing to csv file
with open(filename, 'w') as csvfile:
# creating a csv writer object
csvwriter = csv.writer(csvfile)
# writing the fields
for toe in teams_odds:
csvwriter.writerow(toe)

How do I take a list of urls that are stored in a csv file and import them to python to scrape and export back to csv with newly gathered data

I am very new to python and BeautifulSoup and I am trying to use it to scrape multiple urls at the same time using a loop. The loop will consist of locating the banner slide on the home page of each website and get the len of how many banners that website has and place them into an excel file next to the corresponding url. I have a list of urls saved in a csv file and basically what I want to do is take each of those urls and run the loop, pulling the number of banners, and put that number next to the url into a separate column in excel.
This is the code I have so far and all it does for me is write the urls back into a csv file and gives me the number of banners for only the last url.
from bs4 import BeautifulSoup
import requests
with open("urls.csv", "r") as f:
csv_raw_cont=f.read()
split_csv=csv_raw_cont.split('\n')
split_csv.remove('')
separator=';'
filename = "DDC_number_of_banners.csv"
f = open(filename, "w")
headers = "url, Number_of_Banners\n"
f.write(headers)
for each in split_csv:
url_row_index=0
url = each.split(separator)[url_row_index]
html = requests.get(url).content
soup= BeautifulSoup(html, "html.parser")
banner_info = soup.findAll('div',{'class':['slide', 'slide has-link',
'html-slide slide has-link']})
Number_of_banners = len(banner_info)
f.write(csv_raw_cont + "," + str(Number_of_banners) + "," + "\n")
f.close()
Making use of Python's CSV library would make this a bit simpler:
from bs4 import BeautifulSoup
import requests
import csv
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['url', 'Number_of_banners'])
for url in f_urls:
url = url.strip()
html = requests.get(url).content
soup = BeautifulSoup(html, "html.parser")
banner_info = soup.findAll('div',{'class':['slide', 'slide has-link', 'html-slide slide has-link']})
csv_output.writerow([url, len(banner_info)])
To include information such as each banner's data-label:
from bs4 import BeautifulSoup
import requests
import csv
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['url', 'Number_of_banners', 'data_labels'])
for url in f_urls:
url = url.strip()
html = requests.get(url).content
soup = BeautifulSoup(html, "html.parser")
banner_info = soup.findAll('div',{'class':['slide', 'slide has-link', 'html-slide slide has-link']})
data_labels = [banner.get('data-label') for banner in banner_info]
csv_output.writerow([url, len(banner_info)] + data_labels)

Writing to scraped links to a CSV file using Python3

I have scraped a website for html links and have a result of about 500 links. When I try to write them to a csv file, I do not get the list only the base page.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
link_set = set()
for link in soup.find_all('a'):
web_links = link.get("href")
print(web_links)
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
writer.writerow([web_links])
csvfile.close()
I only get two lines in my csv file. The header 'Links' and www.census.gov. I have tried making it different by add another for loop in the csv writer area, but I get similar results.
for link in soup.find_all('a'):
web_links = link.get('href')
abs_url = join(page, web_links)
print(abs_url)
if abs_url and abs_url not in link_set:
writer.write(str(abs_url) + "\n")
link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the csv file, but no dice. Where am I making my mistake?
In your code, you are writing two row in csv i.e.
writer.writerow(['Links'])
writer.writerow([web_links])
Here web_links is the last instance of retrieved href value.
I don't see the use of set instance. You can print and write in the csv without using set instance in following way :
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
for link in soup.find_all('a'):
web_links = link.get("href")
if web_links:
print(web_links)
writer.writerow([web_links])
csvfile.close()
You have never added the scrapped links to your set():
import requests
from bs4 import BeautifulSoup
import csv
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')
link_set = set()
for link in soup.find_all('a'):
web_links = link.get("href")
print(web_links)
link_set.add(web_links)
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
for link in link_set:
writer.writerow([link])
csvfile.close()

Trouble dealing with header in a csv file

I've written some code using python to scrape some titles and price from a webpage and write the results in a csv file. The script is running awesome. As I'm appending data to a csv file the script is writing headers in such a way that if it runs 4 loops then the headers will be written 4 times. How to fix it so that the headers will be written once. Thanks.
This is the script:
import csv
import requests
from bs4 import BeautifulSoup
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
for link in diction_page:
res = requests.get(link).text
soup = BeautifulSoup(res,'lxml')
title = soup.select_one('.name').text.strip()
price = soup.select_one('.price').text
print(title,price)
with open('item.csv','a',newline='') as outfile:
writer = csv.writer(outfile)
writer.writerow(["Title","Price"])
writer.writerow([title, price])
As an option you can try this:
import csv
import requests
from bs4 import BeautifulSoup
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
for i,link in enumerate(diction_page):
res = requests.get(link).text
soup = BeautifulSoup(res,'lxml')
title = soup.select_one('.name').text.strip()
price = soup.select_one('.price').text
print(title,price)
with open('item.csv','a',newline='') as outfile:
writer = csv.writer(outfile)
if (i == 0):
writer.writerow(["Title","Price"])
writer.writerow([title, price])
Don't write the headers in the for loop:
import csv
import requests
from bs4 import BeautifulSoup
diction_page = ['http://www.bloomberg.com/quote/SPX:IND','http://www.bloomberg.com/quote/CCMP:IND']
outfile = open('item.csv','w',newline='')
writer = csv.writer(outfile)
writer.writerow(["Title","Price"])
for link in diction_page:
res = requests.get(link).text
soup = BeautifulSoup(res,'lxml')
title = soup.select_one('.name').text.strip()
price = soup.select_one('.price').text
print(title,price)
writer.writerow([title, price])
outfile.close()

Categories

Resources