How to improve this web scraping Python script?

Brief context: I started Python two weeks ago, so don't hesitate to correct any mistake or suggest improvements. I am trying to scrape data from the club results list on www.fff.fr.
My way of organizing it is:
Go to the homepage
Accept the cookies
Use the search bar with a city name
Get the results list
Follow each URL on the results page
Go to each club's "Staff" sub-section
Extract the data from that page
I started to build the Python code below, which is not working so far. I'd be really interested in feedback on how to actually make it work.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from shutil import which
import scrapy
from scrapy.selector import Selector

chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get("https://fff.fr")

cookie_btn = driver.find_element_by_id("didomi-notice-agree-button")
cookie_btn.click()

search_input = driver.find_element_by_xpath("/html//form[@id='proximiteSearch']//input[@id='fff_club_form_club_near_to_search_address']")
search_input.send_keys("Paris")
search_input.send_keys(Keys.ENTER)

self.html = driver.page_source
driver.close()

def parse(self, response):
    resp = Selector(text=self.html)
    clubs = resp.xpath("(//ul[contains(@id, 'listresulclub')])/li/text()")
    for club in clubs:
        name = club.xpath(".//text()").get()
        name_link = club.xpath(".//@href").get()
        url = f"https://www.fff.fr{name_link}"
        absolute_url = url[:-10] + "/le-staff"
        # absolute_url = response.urljoin()
        yield scrapy.Request(url=absolute_url, meta={'club_name': name})
        #yield response.follow(url=name_link, callback=self.parse_country, meta={'club_name': name})

def parse_country(self, response):
    name = response.request.meta['club_name']
    contacts = response.xpath("//div[@class='coor-block-content']/ol")
    for contact in contacts:
        contact_nom = contact.xpath(".//li[1]/text()").get()
        yield {
            'club_name': name,
            'correspondant_nom': contact_nom
        }

You can try the same thing without Selenium, and it works:
import bs4
import requests
import sys
import re
import unicodedata
import os
import random
import datetime
Current_Date_Formatted = datetime.datetime.today().strftime('%d-%b-%Y')
time = str(Current_Date_Formatted)
filename = "footballstuff"
cityname = sys.argv[1]
filename = r"D:\Huzefa\Desktop\\" + filename + ".txt"
url = "https://www.fff.fr/resultats?search=" + cityname
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "lxml")
file = open(filename, 'wb')
for i in soup.select("a"):
    f = i.text
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", f)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
file.close()
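If you want to stay with Selenium as in the question, here is a minimal sketch of the first steps of the flow (homepage, cookie banner, search). It reuses the element IDs from the question and adds explicit waits; it assumes Selenium 4, where the driver binary is managed automatically, and it does not cover waiting for the results list itself, whose locator is not verified here.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.fff.fr")
wait = WebDriverWait(driver, 10)

# wait for the cookie banner and accept it (ID taken from the question)
wait.until(EC.element_to_be_clickable((By.ID, "didomi-notice-agree-button"))).click()

# wait for the search input (ID taken from the question), type the city and submit
search_input = wait.until(
    EC.presence_of_element_located((By.ID, "fff_club_form_club_near_to_search_address"))
)
search_input.send_keys("Paris")
search_input.send_keys(Keys.ENTER)

# at this point you would wait for the results list to appear;
# its exact locator is not verified here
html = driver.page_source
driver.quit()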

Related

CNN Scraper sporadically working in python

I've tried to create a web scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get output for some of the scraped pages and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I am iterating over the pages via the URL parameters &page={}&from={}. I tried By.XPATH before, simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------ import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------ CNN SCRAPER
#3.1 ---------- Define function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    load_content = browser.implicitly_wait(30)
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #------------- Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    article_count = 0

    #------------- Iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        load_content
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except:
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
        article_count += 100
        print("-----")

    #------------- Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link
    return df
    #print("Complete")
    browser.quit()

#3.2 ---------- Call function - scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details, check my previous answer.
import requests
import json

def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")
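If you still want the results in a DataFrame, as in the original scraper, here is a minimal sketch building on the answer above. It only uses the headline and url fields shown there; any other field in the response is not assumed.
import requests
import pandas as pd

API = ("https://search.api.cnn.io/content?q=coronavirus&sort=newest"
       "&category=business,us,politics,world,opinion,health&size=100&from={}")

rows = []
with requests.Session() as session:
    for offset in range(0, 200, 100):  # first two pages of 100 results
        data = session.get(API.format(offset)).json()
        for item in data.get('result', []):
            # keep only the fields demonstrated in the answer above
            rows.append({'headline': item['headline'], 'url': item['url']})

df = pd.DataFrame(rows)
print(df.head())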

Trying to scrape email address from website

I was trying to scrape this website:
www.united-church.ca/search/locator/all?keyw=&mission_units_ucc_ministry_type_advanced=10&locll=
I did scrape it using Scrapy, but I couldn't scrape email addresses. Can anyone help me?
Here is my code so far:
# -*- coding: utf-8 -*-
import scrapy
from ..items import ChurchItem

class ChurchSpiderSpider(scrapy.Spider):
    name = 'church_spider'
    page_number = 1
    start_urls = ['https://www.united-church.ca/search/locator/all?keyw=&mission_units_ucc_ministry_type_advanced=10&locll=']

    def parse(self, response):
        items = ChurchItem()
        container = response.css(".icon-ministry")
        for t in container:
            church_name = t.css(".field-name-locator-ministry-title a::text").extract()
            church_phone = t.css(".field-name-field-phone::text").extract()
            church_address = t.css(".thoroughfare::text").extract()
            church_email = t.css(".field-name-field-mu-email span::text").extract()

            items["church_name"] = church_name
            items["church_phone"] = church_phone
            items["church_address"] = church_address
            items["church_email"] = church_email
            yield items

        # next_page = 'https://www.united-church.ca/search/locator/all?keyw=&mission_units_ucc_ministry_type_advanced=10&locll=&page=' + str(ChurchSpiderSpider.page_number)
        # if ChurchSpiderSpider.page_number <= 110:
        #     ChurchSpiderSpider.page_number += 1
        #     yield response.follow(next_page, callback=self.parse)
I have found a partial solution, but it is still not complete. The output is now like:
{'church_address': ['7763 Highway 21'],
'church_email': ['herbklaehn', ' [at] ', 'gmail.com'],
'church_name': ['Allenford United Church'],
'church_phone': ['519-35-6232']}
How do I replace [at] with @ and combine the email address into one string?
Using Beautiful Soup
A simple way to get the email is to look for the div with class 'field-name-field-mu-email', and then replace the obfuscated display with a proper email format.
For instance:
import requests
from bs4 import BeautifulSoup

url = 'https://www.united-church.ca/search/locator/all?keyw=&mission_units_ucc_ministry_type_advanced=10&locll='
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for div in soup.findAll('div', attrs={'class': 'field-name-field-mu-email'}):
    print(div.find('span').text.replace(' [at] ', '@'))
Out[1]:
alpcharge@sasktel.net
guc-eug@bellnet.ca
pioneerpastoralcharge@gmail.com
acmeunitedchurch@gmail.com
cmcphers@lakeheadu.ca
mbm@kos.net
tommaclaren@gmail.com
agassizunited@shaw.ca
buchurch@xplornet.com
dmitchell008@yahoo.ca
karen.charlie62@gmail.com
trinityucbdn@westman.wave.ca
gepc.ucc.mail@gmail.com
monacampbell181@gmail.com
herbklaehn@gmail.com
You can also try web scraping with Selenium. I tried this code and it's giving the expected results.
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("chromedriver")
driver.get("https://www.united-church.ca/search/locator/all?keyw=&mission_units_ucc_ministry_type_advanced=10&locll=")
content = driver.page_source
soup = BeautifulSoup(content)
for all_emails in soup.find_all('a', class_="spamspan"):
    print(all_emails.text)
Results:
alpcharge@sasktel.net
guc-eug@bellnet.ca
pioneerpastoralcharge@gmail.com
acmeunitedchurch@gmail.com
cmcphers@lakeheadu.ca
mbm@kos.net
tommaclaren@gmail.com
agassizunited@shaw.ca
buchurch@xplornet.com
dmitchell008@yahoo.ca
karen.charlie62@gmail.com
trinityucbdn@westman.wave.ca
gepc.ucc.mail@gmail.com
monacampbell181@gmail.com
herbklaehn@gmail.com
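To answer the follow-up about combining the pieces inside the Scrapy spider itself, here is a minimal sketch that assumes the extracted list has the ['name', ' [at] ', 'domain'] shape shown in the question:
def clean_email(parts):
    # parts is something like ['herbklaehn', ' [at] ', 'gmail.com']
    return ''.join(parts).replace(' [at] ', '@').strip()

# inside parse(), instead of assigning the raw list:
# items["church_email"] = clean_email(t.css(".field-name-field-mu-email span::text").extract())

print(clean_email(['herbklaehn', ' [at] ', 'gmail.com']))  # herbklaehn@gmail.com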

I can't find an element using BeautifulSoup?

I am trying to make a bot to scrape this page, http://www.skysports.com/transfer-centre. When I open it in Chrome I can see the element I want in the developer tools,
but when I use the code below, it returns None:
import requests
from bs4 import BeautifulSoup
page = requests.get('http://www.skysports.com/transfer-centre').text
soup = BeautifulSoup(page, 'lxml')
print(soup.find('time'))
Am I missing something?
The page loads its content with JavaScript, so the <time> elements are not in the HTML that requests receives. To extract the times, use the endpoint the page itself loads its data from (obtained from http://www.skysports.com/transfer-centre):
https://data.livefyre.com/bs3/v3.1/bskyb.fyre.co/363166/MTAwMDk1MTI=/init
import requests
import time
import json

page = requests.get('https://data.livefyre.com/bs3/v3.1/bskyb.fyre.co/363166/MTAwMDk1MTI=/init').json()
contents = page['headDocument']['content']
for content in contents:
    c = content['content']
    t = c['updatedAt']
    tt = time.strftime('%H:%M', time.localtime(t))
    authorId = c['authorId']
    bodyHtml = c['bodyHtml']
    print('time : {} , author: {}'.format(tt, authorId))
    print('------------')
    print('bodyHtml: {}', json.dumps(bodyHtml, None, 4))
    print('-----------\n\n')
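If you only need the plain text of each entry rather than the raw HTML, a small addition reusing the bodyHtml value from the loop above could strip the markup with BeautifulSoup (the sample fragment below is made up for illustration):
from bs4 import BeautifulSoup

def html_to_text(body_html):
    # strip the markup from a Livefyre bodyHtml fragment
    return BeautifulSoup(body_html, 'html.parser').get_text(' ', strip=True)

# illustrative fragment; in the loop above you would pass bodyHtml
print(html_to_text('<p>Transfer <b>update</b> at 10:00</p>'))  # Transfer update at 10:00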

In Python3, how can I use the .append function to add a string to scraped links?

Thanks to stackoverflow.com I was able to write a program that scrapes web links from any given web page. However, I need it to concatenate the home URL to any relative link that it comes across. (Example: "http://www.google.com/sitemap" is okay, but just "/sitemap" by itself is not.)
In the following code,
from bs4 import BeautifulSoup as mySoup
from urllib.parse import urljoin as myJoin
from urllib.request import urlopen as myRequest

base_url = "https://www.census.gov/programs-surveys/popest.html"
html_page = myRequest(base_url)
raw_html = html_page.read()
page_soup = mySoup(raw_html, "html.parser")
html_page.close()

f = open("census4-3.csv", "w")
all_links = page_soup.find_all('a', href=True)

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        full_url = myJoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, base_url)
for link in cleaned_links:
    f.write(str(link) + '\n')
f.close()
print("The CSV file is saved to your computer.")
how and where would I add something like this:
.append("http://www.google.com")
You should save your base URL as:
base_url = 'https://www.census.gov'
Then call the request like this:
html_page = myRequest(base_url + '/programs-surveys/popest.html')
When you want to get any full_url, just do this:
full_url = base_url + link
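For what it's worth, urljoin (already imported as myJoin in the question) resolves relative links against the page URL on its own, so the original approach works too. A minimal check, with illustrative paths:
from urllib.parse import urljoin

base_url = "https://www.census.gov/programs-surveys/popest.html"
# root-relative links are resolved against the site root
print(urljoin(base_url, "/sitemap"))                     # https://www.census.gov/sitemap
# relative links are resolved against the current directory
print(urljoin(base_url, "international-programs.html"))  # https://www.census.gov/programs-surveys/international-programs.html
# absolute links pass through unchanged
print(urljoin(base_url, "https://www.google.com/sitemap"))  # https://www.google.com/sitemap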

I'm attempting to extract some links from craigslist using beautifulsoup but it's pulling the links 100 times rather than once

So I'm attempting to extract the links of the most recent TV listings from craigslist. I've gotten it to the point where I get the information that I want, but for some reason it pulls that information about 100 times before it moves on to the next link. I'm not sure why it's doing that.
import urllib2
from bs4 import BeautifulSoup
import re
import time
import csv

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# id url
url = ('http://omaha.craigslist.org/sya/')
# this opens the url
ourUrl = opener.open(url).read()
# now we are passing the url to beautiful soup
soup = BeautifulSoup(ourUrl)
for link in soup.findAll('a', attrs={'class': re.compile("hdrlnk")}):
    find = re.compile('/sys/(.*?)"')
    #time.sleep(1)
    timeset = time.strftime("%m-%d %H:%M")  # current date and time
    for linka in soup.findAll('a', attrs={'href': re.compile("^/sys/")}):
        find = re.compile('/sys/(.*?)"')
        searchTv = re.search(find, str(link))
        Tv = searchTv.group(1)
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        url = ('http://omaha.craigslist.org/sys/' + Tv)
        ourUrl = opener.open(url).read()
        soup = BeautifulSoup(ourUrl)
        print "http://omaha.craigslist.org/sys/" + Tv
        try:
            outfile = open('C:/Python27/Folder/Folder/Folder/craigstvs.txt', 'a')
            outfile.write(timeset + "; " + link.text + "; " + "http://omaha.craigslist.org/sys/" + Tv + '\n')
            timeset = time.strftime("%m-%d %H:%M")  # current date and time
        except:
            print "No go--->" + str(link.text)
Here is an example of what it outputs: 08-10 15:19; MAC mini intel core wifi dvdrw great cond ; http://omaha.craigslist.org/sys/4612480593.html
Which is exactly what I'm trying to accomplish, except it extracts that info about 100+ times before moving on to the next listing. I'm at a standstill and can't figure it out.
Any help would be appreciated, thanks ahead of time!
Scrapy for @alexce:
import scrapy
import csv
from tutorial.items import DmozItem
import re
import urllib2
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
import html2text

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["http://omaha.craigslist.org"]
    start_urls = [
        "http://omaha.craigslist.org/sya/",
    ]

    def parse(self, response):
        for sel in response.xpath('//html'):
            #title = sel.xpath('a/text()').extract()
            link = sel.xpath('/html/body/article/section/div/div[2]/p/span/span[2]/a').extract()[0:4]
            #at this point it doesn't repeat itself, which is good!
            #desc = sel.xpath('text()').extract()
            print link
You don't need a nested loop here. Other notes/improvements:
the opener.open() result can be passed directly to the BeautifulSoup constructor, no need for read()
the opener can be defined once and reused in the loop to follow links
use find_all() instead of findAll()
use urljoin() for concatenating URL parts
use the csv module for writing the delimited data
use the with context manager when dealing with files
The complete fixed version:
import csv
import re
import time
import urllib2
from urlparse import urljoin
from bs4 import BeautifulSoup

BASE_URL = 'http://omaha.craigslist.org/sys/'
URL = 'http://omaha.craigslist.org/sya/'
FILENAME = 'C:/Python27/Folder/Folder/Folder/craigstvs.txt'

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
soup = BeautifulSoup(opener.open(URL))

with open(FILENAME, 'a') as f:
    writer = csv.writer(f, delimiter=';')
    for link in soup.find_all('a', class_=re.compile("hdrlnk")):
        timeset = time.strftime("%m-%d %H:%M")
        item_url = urljoin(BASE_URL, link['href'])
        item_soup = BeautifulSoup(opener.open(item_url))
        # do smth with the item_soup? or why did you need to follow this link?
        writer.writerow([timeset, link.text, item_url])
And here is what the code produces:
08-10 16:56;Dell Inspiron-15 Laptop;http://omaha.craigslist.org/sys/4612666460.html
08-10 16:56;computer????;http://omaha.craigslist.org/sys/4612637389.html
08-10 16:56;macbook 13 inch 160 gig wifi dvdrw ;http://omaha.craigslist.org/sys/4612480237.html
08-10 16:56;MAC mini intel core wifi dvdrw great cond ;http://omaha.craigslist.org/sys/4612480593.html
...
Just a side note: since you need to follow the links, get the data, and output it into a CSV file, it sounds like Scrapy would be a very good fit here. There are Rules and Link Extractors, and it can serialize crawled items into CSV out of the box.
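A rough sketch of what that Scrapy version might look like: the link extraction reuses the hdrlnk class from the code above, while the item selectors are assumptions and may need adjusting to the actual craigslist markup. You could run it with scrapy runspider craigslist_tv.py -o tvs.csv.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class TvSpider(CrawlSpider):
    name = "craigslist_tv"
    allowed_domains = ["omaha.craigslist.org"]
    start_urls = ["http://omaha.craigslist.org/sya/"]

    # follow every listing link and hand each response to parse_item
    rules = (
        Rule(LinkExtractor(restrict_css="a.hdrlnk"), callback="parse_item"),
    )

    def parse_item(self, response):
        # these selectors are guesses and may need adjusting to the real page
        yield {
            "url": response.url,
            "title": response.css("span#titletextonly::text").get(),
        }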
