I want to scrape the URLs of all titles using Python

I wrote code to get all the title URLs, but it has an issue: it displays None values. Could you please help me out?
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('div', class_="marginTopTextAdjuster")
    except:
        titles_link = []
    urls = [item.get('href') for item in titles_link]
    print(urls)

def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    #get_page(url)
    get_index_data(get_page(mainurl))
    #write_csv(data,url)

if __name__ == '__main__':
    main()

You are trying to get the href attribute of the div tag, which doesn't have one, which is why you get None values. Instead, select all the a tags; they seem to have a common class attribute, body_link_11.
Use titles_link = soup.find_all('a',class_="body_link_11") instead of titles_link = soup.find_all('div',class_="marginTopTextAdjuster")
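Putting that together, a minimal sketch of the corrected get_index_data (assuming the body_link_11 class mentioned above is the one carrying the title links) could look like this:
def get_index_data(soup):
    # Select the anchor tags that hold the title links; body_link_11 is the
    # class suggested above and may differ if the site's markup changes.
    titles_link = soup.find_all('a', class_="body_link_11")
    # Anchors carry the href themselves, so the None values disappear,
    # but skip any stray <a> without an href just in case.
    urls = [item.get('href') for item in titles_link if item.get('href')]
    print(urls)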

url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
titles_link = []
titles_div = soup.find_all('div', attrs={'class': 'marginTopTextAdjuster'})
for link in titles_div:
tag = link.find_all('a', href=True)
try:
if tag[0].attrs.get('item_id', None):
titles_link.append({tag[0].text: tag[0].attrs.get('href', None)})
except IndexError:
continue
print(titles_link)
Output:
[{'Civil Affairs Handbook, Japan, section 1a: population statistics.': '/cdm/singleitem/collection/p4013coll8/id/2653/rec/1'}, {'Army Air Forces Program 1943.': '/cdm/singleitem/collection/p4013coll8/id/2385/rec/2'}, {'Casualty report number II.': '/cdm/singleitem/collection/p4013coll8/id/3309/rec/3'}, {'Light armored division, proposed March 1943.': '/cdm/singleitem/collection/p4013coll8/id/2425/rec/4'}, {'Tentative troop list by type units for Blacklist operations.': '/cdm/singleitem/collection/p4013coll8/id/150/rec/5'}, {'Chemical Warfare Service: history of training, part 2, schooling of commissioned officers.': '/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6'}, {'Horses in the German Army (1941-1945).': '/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7'}, {'Unit history: 38 (MECZ) cavalry rcn. sq.': '/cdm/singleitem/collection/p4013coll8/id/3672/rec/8'}, {'Operations in France: December 1944, 714th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3407/rec/9'}, {'G-3 Reports : Third Infantry Division. (22 Jan- 30 Mar 44)': '/cdm/singleitem/collection/p4013coll8/id/4393/rec/10'}, {'Summary of operations, 1 July thru 31 July 1944.': '/cdm/singleitem/collection/p4013coll8/id/3445/rec/11'}, {'After action report 36th Armored Infantry Regiment, 3rd Armored Division, Nov 1944 thru April 1945.': '/cdm/singleitem/collection/p4013coll8/id/3668/rec/12'}, {'Unit history, 38th Mechanized Cavalry Reconnaissance Squadron, 9604 thru 9665.': '/cdm/singleitem/collection/p4013coll8/id/3703/rec/13'}, {'Redeployment: occupation forces in Europe series, 1945-1946.': '/cdm/singleitem/collection/p4013coll8/id/2952/rec/14'}, {'Twelfth US Army group directives. Annex no. 1.': '/cdm/singleitem/collection/p4013coll8/id/2898/rec/15'}, {'After action report, 749th Tank Battalion: Jan, Feb, Apr - 8 May 45.': '/cdm/singleitem/collection/p4013coll8/id/3502/rec/16'}, {'743rd Tank Battalion, S3 journal history.': '/cdm/singleitem/collection/p4013coll8/id/3553/rec/17'}, {'History of military training, WAAC / WAC training.': '/cdm/singleitem/collection/p4013coll8/id/4052/rec/18'}, {'After action report, 756th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3440/rec/19'}, {'After action report 92nd Cavalry Recon Squadron Mechanized 12th Armored Division, Jan thru May 45.': '/cdm/singleitem/collection/p4013coll8/id/3583/rec/20'}]

An easy way to do it with requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup
req = requests.get(url) # url stands for the page's url you want to find
soup = BeautifulSoup(req.text, "html.parser") # req.text is the complete html of the page
print(soup.title.string) # soup.title will give you the title of the page but with the <title> tags so .string removes them
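As a small defensive addition (just a sketch, not required for this particular page): soup.title is None when a page has no <title> tag, so a guard avoids an AttributeError:
title_tag = soup.title
if title_tag is not None:  # pages without a <title> tag return None here
    print(title_tag.string)
else:
    print('No <title> tag found')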

Try this.
from simplified_scrapy import SimplifiedDoc,req,utils
url = 'http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1'
html = req.get(url)
doc = SimplifiedDoc(html)
lst = doc.selects('div.marginTopTextAdjuster').select('a')
titles_link = [(utils.absoluteUrl(url,a.href),a.text) for a in lst if a]
print (titles_link)
Result:
[('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'Civil Affairs Handbook, Japan, section 1a: population statistics.'), ('http://cgsc.cdmhost.com/cdm/landingpage/collection/p4013coll8', 'World War II Operational Documents'), ('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'Army Air Forces Program 1943.'),...

Related

Web scraping headlines and subtitles from a specific section with a specific tag id

I tried the following code to get the headlines and subtitles from the section https://www.theguardian.com/uk/environment with the id environment/wildlife for October, November, and December 2022.
Any ideas about what could be wrong? Thanks for your help.
r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
elements = soup.find_all('gu-island')
filtered_elements = [element for element in elements if element['props']['id'] == 'environment/wildlife']
headlines = []
for element in filtered_elements:
    headlines.extend(element.find_all('h1'))
texts = [headline.text for headline in headlines]
print(texts)
I also tried with the keyword wildlife, but got no results.
r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
elements = soup.find_all('gu-island')
filtered_elements = [element for element in elements if 'wildlife' in element['props']['tags']]
headlines = []
for element in filtered_elements:
    headlines.extend(element.find_all('h1'))
texts = [headline.text for headline in headlines]
print(texts)
The following code, using the URL of a specific article, does extract the headline and subtitle, but it also extracts the entire text of the article, which I don't want.
r = requests.get('https://www.theguardian.com/environment/2022/dec/30/tales-of-killer-wild-boar-in-uk-are-hogwash-say-environmentalists')
soup = BeautifulSoup(r.text, 'html.parser')
headlines = soup.find_all('h1')
standfirst = soup.find_all('p')
for headline in headlines:
    print(headline.text)
for standfirst in standfirst:
    print(standfirst.text)
Just inspect your page. I'm assuming you want the title of every article on the main page, as you didn't specify it clearly.
With that information, you can search the section as follows:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.theguardian.com/uk/environment')
soup = BeautifulSoup(r.text, 'html.parser')
items = soup.find_all("div", {"class": "fc-item__content"})
for item in items:
    print(item.text.strip())
Tip: use .strip() to get just the inner text within the <a> tag.
The output gives you the text of each article item on the page.
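If you also want the link alongside each headline, a rough sketch building on the loop above (assuming each fc-item__content block wraps its headline in an <a> tag, which is what the tip above relies on) would be:
for item in items:
    link = item.find('a')  # the anchor holding the headline
    if link is not None:
        print(link.text.strip(), link.get('href'))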
The following script will extract all of the headlines from the environment/wildlife pages.
An example url from the 3rd Oct 2022 would be:
https://www.theguardian.com/environment/wildlife/2022/oct/03/all
You can modify the script to specify the required start_date and end_date.
Please note, you will have to specify an end_date that is one day beyond the end date you want.
All of the headlines within those dates will be stored in the headlines variable.
I have also introduced a sleep time of 10 seconds between page reads, to avoid being blocked by the website.
Code:
from bs4 import BeautifulSoup
import requests
from datetime import date, timedelta
import pandas as pd
from time import sleep

def get_headings(dt):
    p = dt.strftime("%Y-%b-%d").split('-')
    r = requests.get(f'https://www.theguardian.com/environment/wildlife/{p[0]}/{p[1].lower()}/{p[2]}/all')
    soup = BeautifulSoup(r.text, 'html.parser')
    elements = soup.select('div.fc-slice-wrapper')
    headings = [h.text for h in elements[0].find_all(class_="js-headline-text")][::2]
    return headings

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2022, 10, 1)
end_date = date(2022, 10, 4)
headlines = []
for single_date in daterange(start_date, end_date):
    headlines.extend(get_headings(single_date))
    sleep(10)  # sleep 10 seconds between each page to avoid being blocked

for h in headlines:
    print(h)
Output:
Gardeners beware: household chemicals banned overseas are still used in Australia
Cop15 is an opportunity to save nature. We can’t afford another decade of failure
Prince Harry wildlife NGO under fire after elephants kill three in Malawi
Country diary: Mysterious birdsong fills the air with sweetness
Tory MPs dismiss critical RSPB campaign as ‘marketing strategy’
Australia announces plan to halt extinction crisis and save 110 species
Sixty endangered greater gliders found in Victorian forests tagged for logging
Wales unveils plans to triple rate of peatland restoration
Europe and UK hit by ‘unprecedented’ number of bird flu cases this summer

Generating URL for Yahoo news and Bing news with Python and BeautifulSoup

I want to scrape data from the Yahoo News and Bing News pages. The data I want to scrape are the headlines and/or the text below the headlines (whatever can be scraped) and the dates (times) when they were posted.
I have written some code, but it does not return anything. The problem is with my URL, since I'm getting a 404 response.
Can you please help me with it?
This is the code for Bing:
from bs4 import BeautifulSoup
import requests
term = 'usa'
url = 'http://www.bing.com/news/q?s={}'.format(term)
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
And this is for Yahoo:
term = 'usa'
url = 'http://news.search.yahoo.com/q?s={}'.format(term)
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
Please help me generate these URLs and explain the logic behind them. I'm still a noob :)
Basically, your URLs are just wrong. The URLs you have to use are the same ones you see in the address bar when using a regular browser. Most search engines and aggregators use the q parameter for the search term. Most of the other parameters are usually not required (sometimes they are, e.g. for specifying the result page number).
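Rather than formatting the query string by hand, you can also let requests build and URL-encode it for you via the params argument (a sketch using the same q parameter described above):
import requests

# requests assembles the query string from the dict, so the search term is
# encoded correctly even if it contains spaces or special characters.
response = requests.get('https://www.bing.com/news/search', params={'q': 'usa'})
print(response.url)  # https://www.bing.com/news/search?q=usa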
Bing
from bs4 import BeautifulSoup
import requests
import re

term = 'usa'
url = 'https://www.bing.com/news/search?q={}'.format(term)
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
for news_card in soup.find_all('div', class_="news-card-body"):
    title = news_card.find('a', class_="title").text
    time = news_card.find(
        'span',
        attrs={'aria-label': re.compile(".*ago$")}
    ).text
    print("{} ({})".format(title, time))
Output
Jason Mohammed blitzkrieg sinks USA (17h)
USA Swimming held not liable by California jury in sexual abuse case (1d)
United States 4-1 Canada: USA secure payback in Nations League (1d)
USA always plays the Dalai Lama card in dealing with China, says Chinese Professor (1d)
...
Yahoo
from bs4 import BeautifulSoup
import requests

term = 'usa'
url = 'https://news.search.yahoo.com/search?q={}'.format(term)
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
for news_item in soup.find_all('div', class_='NewsArticle'):
    title = news_item.find('h4').text
    time = news_item.find('span', class_='fc-2nd').text
    # Clean time text
    time = time.replace('·', '').strip()
    print("{} ({})".format(title, time))
Output
USA Baseball will return to Arizona for second Olympic qualifying chance (52 minutes ago)
Prized White Sox prospect Andrew Vaughn wraps up stint with USA Baseball (28 minutes ago)
Mexico defeats USA in extras for Olympic berth (13 hours ago)
...

Web-scraping articles from WSJ using BeautifulSoup in Python 3.7?

I am trying to scrape articles from the Wall Street Journal using BeautifulSoup in Python. However, the code I am running executes without any error (exit code 0) but returns no results, and I don't understand why it is not giving the expected output.
I even have a paid subscription.
I know that something is not right, but I can't locate the problem.
import time
import requests
from bs4 import BeautifulSoup

url = 'https://www.wsj.com/search/term.html?KEYWORDS=cybersecurity&min-date=2018/04/01&max-date=2019/03/31' \
      '&isAdvanced=true&daysback=90d&andor=AND&sort=date-desc&source=wsjarticle,wsjpro&page={}'
pages = 32
for page in range(1, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".items.hedSumm li > a"):
        resp = requests.get(item.get("href"))
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.wsj.com" + _href)
            except Exception as e:
                continue
        sauce = BeautifulSoup(resp.text, "lxml")
        date = sauce.select("time.timestamp.article__timestamp.flexbox__flex--1")
        date = date[0].text
        tag = sauce.select("li.article-breadCrumb span").text
        title = sauce.select_one("h1.wsj-article-headline").text
        content = [elem.text for elem in sauce.select("p.article-content")]
        print(f'{date}\n {tag}\n {title}\n {content}\n')
        time.sleep(3)
As I wrote in the code, I am trying to scrape the date, title, tag, and content of all the articles. It would be helpful to get suggestions about my mistakes and what I should do to get the desired results.
Replace your code:
resp = requests.get(item.get("href"))
with:
_href = item.get("href")
try:
    resp = requests.get(_href)
except Exception as e:
    try:
        resp = requests.get("https://www.wsj.com" + _href)
    except Exception as e:
        continue
This is because most of the values from item.get("href") are not proper website URLs; for example, you are getting URLs like these:
/news/types/national-security
/public/page/news-financial-markets-stock.html
https://www.wsj.com/news/world
Only https://www.wsj.com/news/world is a valid website URL, so you need to concatenate the base URL with _href.
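A slightly more robust way to handle the mix of relative and absolute hrefs is urllib.parse.urljoin, which leaves absolute URLs untouched and resolves relative ones against the base (just a sketch of the same idea):
from urllib.parse import urljoin

_href = item.get("href")
# urljoin("https://www.wsj.com", "/news/world") -> "https://www.wsj.com/news/world"
# urljoin("https://www.wsj.com", "https://www.wsj.com/news/world") stays unchanged
resp = requests.get(urljoin("https://www.wsj.com", _href))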
Update:
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

url = 'https://www.wsj.com/search/term.html?KEYWORDS=cybersecurity&min-date=2018/04/01&max-date=2019/03/31' \
      '&isAdvanced=true&daysback=90d&andor=AND&sort=date-desc&source=wsjarticle,wsjpro&page={}'
pages = 32
for page in range(1, pages+1):
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.find_all("a", {"class": "headline-image"}, href=True):
        _href = item.get("href")
        try:
            resp = requests.get(_href)
        except Exception as e:
            try:
                resp = requests.get("https://www.wsj.com" + _href)
            except Exception as e:
                continue
        sauce = BeautifulSoup(resp.text, "lxml")
        dateTag = sauce.find("time", {"class": "timestamp article__timestamp flexbox__flex--1"})
        tag = sauce.find("li", {"class": "article-breadCrumb"})
        titleTag = sauce.find("h1", {"class": "wsj-article-headline"})
        contentTag = sauce.find("div", {"class": "wsj-snippet-body"})
        date = None
        tagName = None
        title = None
        content = None
        if isinstance(dateTag, Tag):
            date = dateTag.get_text().strip()
        if isinstance(tag, Tag):
            tagName = tag.get_text().strip()
        if isinstance(titleTag, Tag):
            title = titleTag.get_text().strip()
        if isinstance(contentTag, Tag):
            content = contentTag.get_text().strip()
        print(f'{date}\n {tagName}\n {title}\n {content}\n')
        time.sleep(3)
Output:
March 31, 2019 10:00 a.m. ET
Tech
Care.com Removes Tens of Thousands of Unverified Listings
The online child-care marketplace Care.com scrubbed its site of tens of thousands of unverified day-care center listings just before a Wall Street Journal investigation published March 8, an analysis shows. Care.com, the largest site in the U.S. for finding caregivers, removed about 72% of day-care centers, or about 46,594 businesses, listed on its site, a Journal review of the website shows. Those businesses were listed on the site as recently as March 1....
Updated March 29, 2019 6:08 p.m. ET
Politics
FBI, Retooling Once Again, Sets Sights on Expanding Cyber Threats
The FBI has launched its biggest transformation since the 2001 terror attacks to retrain and refocus special agents to combat cyber criminals, whose threats to lives, property and critical infrastructure have outstripped U.S. efforts to thwart them. The push comes as federal investigators grapple with an expanding range of cyber attacks sponsored by foreign adversaries against businesses or national interests, including Russian election interference and Chinese cyber thefts from American companies, senior bureau executives...

Is there a better way to extract SIC and Fiscal Year End from this web page?

Here is the link: https://www.sec.gov/cgi-bin/browse-edgar?CIK=20&owner=exclude&action=getcompany&Find=Search
I want to extract SIC (i.e. 3823) and year end (i.e. 0102). My code is as follows. It works, but I feel it is cumbersome. What is a better way? Thanks.
# soup is a BeautifulSoup soup object
link_tags = soup.find_all("a")
if link_tags:
    for link in link_tags:
        if "SIC=" in link.get("href"):
            sic = link.string.strip()

re_yend = re.compile(r"Fiscal Year End: *(\d{4})")
match = re_yend.search(str(soup))
if match:
    y_end = str(match.group(1))
Here is another way to get the data from the website:
import re
import requests
from bs4 import BeautifulSoup as bs

def get_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError('Cannot read the data')
    return response.text

def get_sic_fiscal(data):
    soup = bs(data, 'html.parser')
    # Get the company info block
    company_info = soup.find('div', {'class': 'companyInfo'})
    # Get the acronym tag
    acronym = company_info.find('acronym', {'title': 'Standard Industrial Code'})
    # Find the link that follows the acronym tag
    sic = acronym.findNext('a')
    # Limit the search for the fiscal year end
    # to the company info block only
    fiscal_year_end = re.search(r'Fiscal Year End:\s+(\d+)', company_info.text)
    if fiscal_year_end:
        return sic.text, fiscal_year_end.group(1)
    return sic.text, None

url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=20&owner=exclude&action=getcompany&Find=Search'
data = get_data(url)
sic, fiscal = get_sic_fiscal(data)
print('SIC: {sic} and Fiscal year end: {fiscal}'.format(sic=sic, fiscal=fiscal))
Output:
SIC: 3823 and Fiscal year end: 0102
You can simplify the search for the SIC significantly with a CSS selector that looks for SIC in the href. Your approach to finding the fiscal year is pretty good, although there's no need to explicitly compile the regex, and if you know the data will always be there, you can also drop the match check:
print(soup.select_one('.identInfo a[href*="SIC"]').text)
print(re.search(r"Fiscal Year End: *(\d+)", soup.text).group(1))
Result:
3823
0102

Web scraping with Python requests and Beautiful Soup

I am trying to scrape all the articles on this web page: https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/
I can scrape the first article, but need help understanding how to jump to the next article and scrape the information there. Thank you in advance for your support.
import requests
from bs4 import BeautifulSoup

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')

# Scraping news articles from Coindesk
def scrapeCoindesk(url):
    bs = getPage(url)
    title = bs.find("h3").text
    body = bs.find("p", {'class': 'desc'}).text
    return Content(url, title, body)

# Pulling the article from Coindesk
url = 'https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/'
content = scrapeCoindesk(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)
You can use the fact that every article is contained within a div.article to iterate over them:
def scrapeCoindesk(url):
    bs = getPage(url)
    articles = []
    for article in bs.find_all("div", {"class": "article"}):
        title = article.find("h3").text
        body = article.find("p", {"class": "desc"}).text
        article_url = article.find("a", {"class": "fade"})["href"]
        articles.append(Content(article_url, title, body))
    return articles

# Pulling the articles from Coindesk
url = 'https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/'
content = scrapeCoindesk(url)
for article in content:
    print(article.url)
    print(article.title)
    print(article.body)
    print("-------------")
You can use find_all with BeautifulSoup:
from bs4 import BeautifulSoup as soup
from collections import namedtuple
import requests, re
article = namedtuple('article', 'title, link, timestamp, author, description')
r = requests.get('https://www.coindesk.com/category/markets-news/markets-markets-news/markets-bitcoin/').text
full_data = soup(r, 'lxml')
results = [[i.text, i['href']] for i in full_data.find_all('a', {'class':'fade'})]
timestamp = [re.findall('(?<=\n)[a-zA-Z\s]+[\d\s,]+at[\s\d:]+', i.text)[0] for i in full_data.find_all('p', {'class':'timeauthor'})]
authors = [i.text for i in full_data.find_all('a', {'rel':'author'})]
descriptions = [i.text for i in full_data.find_all('p', {'class':'desc'})]
full_articles = [article(*(list(i[0])+list(i[1:]))) for i in zip(results, timestamp, authors, descriptions) if i[0][0] != '\n ']
Output:
[article(title='Topping Out? Bitcoin Bulls Need to Defend $9K', link='https://www.coindesk.com/topping-out-bitcoin-bulls-need-to-defend-9k/', timestamp='May 8, 2018 at 09:10 ', author='Omkar Godbole', description='Bitcoin risks falling to levels below $9,000, courtesy of the bearish setup on the technical charts. '), article(title='Bitcoin Risks Drop Below $9K After 4-Day Low', link='https://www.coindesk.com/bitcoin-risks-drop-below-9k-after-4-day-low/', timestamp='May 7, 2018 at 11:00 ', author='Omkar Godbole', description='Bitcoin is reporting losses today but only a break below $8,650 would signal a bull-to-bear trend change. '), article(title="Futures Launch Weighed on Bitcoin's Price, Say Fed Researchers", link='https://www.coindesk.com/federal-reserve-scholars-blame-bitcoins-price-slump-to-the-futures/', timestamp='May 4, 2018 at 09:00 ', author='Wolfie Zhao', description='Cai Wensheng, a Chinese angel investor, says he bought 10,000 BTC after the price dropped earlier this year.\n'), article(title='Bitcoin Looks for Price Support After Failed $10K Crossover', link='https://www.coindesk.com/bitcoin-looks-for-price-support-after-failed-10k-crossover/', timestamp='May 3, 2018 at 10:00 ', author='Omkar Godbole', description='While equity bulls fear drops in May, it should not be a cause of worry for the bitcoin market, according to historical data.'), article(title='Bitcoin Sets Sights Above $10K After Bull Breakout', link='https://www.coindesk.com/bitcoin-sets-sights-10k-bull-breakout/', timestamp='May 3, 2018 at 03:18 ', author='Wolfie Zhao', description="Goldman Sachs is launching a new operation that will use the firm's own money to trade bitcoin-related contracts on behalf of its clients.")]
