I'm trying to parse all the query strings present in a page so that I can use them to navigate to a specific page. The code I tried is below:
import requests
from bs4 import BeautifulSoup
import datetime
import dateutil.parser
import time
import pytz

"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo """
from espncricinfo.match import Match
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError

"""----time-zone-calculation----"""
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

"""------URL of page to parse-------with a date of today-----"""
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today
"""eg. url = http://www.espncricinfo.com/ci/engine/match/index.html?date=2018-02-12"""
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

"""------parsing for matchno------"""
match_no = [x['href'].split('/', 4)[4].split('.')[0] for x in
            soup.findAll('a', href=True, text='Scorecard')]

for p in match_no:
    """ where p is a match no, e.g p = '1122282'"""
    m = Match(p)
    m.latest_batting
    print(m.latest_batting)
When I print match_no I get this output:
['8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118743/assam-vs-odisha-group-a-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118745/bengal-vs-delhi-group-b-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118763/chhattisgarh-vs-vidarbha-group-d-vijay-hazare-trophy-2017-18/']
This page (e.g. http://www.espncricinfo.com/ci/engine/match/index.html?date=2018-02-12) lists all the match numbers of games happening on that day. I want to trim each entry down to just the 7-digit match number [1118760, 1118743, 1118745, ...]. How can I do this? Then I can pass that match number to Match() and get the details of a particular match happening on that day.
PS: if no match is going on that day, match_no comes back empty.
First, your code is very hard to read. You need to let your code breathe and make it appealing for others to read.
Second, the line that is probably causing the issue is this one:
match_no = [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]
It is hard to read too. There are far better and more readable ways of parsing the match id out of a URL.
Here is an example of what should work. I used a provisional date for the matches:
import re
import pytz
import requests
import datetime
from bs4 import BeautifulSoup
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
from espncricinfo.match import Match
# python espncricinfo library module: https://github.com/dwillis/python-espncricinfo


def get_match_id(link):
    match_id = re.search(r'([0-9]{7})', link)
    if match_id is None:
        return None
    return match_id.group()


# ----time-zone-calculation----
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

# ------URL of page to parse-------with a date of today-----
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

spans = soup.findAll('span', {"class": "match-no"})

matches_ids = []
for s in spans:
    for a in s.findAll('a', href=lambda href: 'scorecard' in href):
        match_id = get_match_id(a['href'])
        if match_id is None:
            continue
        matches_ids.append(match_id)

# ------parsing for matchno------
for p in matches_ids:
    # where p is a match no, e.g p = '1122282'
    m = Match(p)
    m.latest_batting
    print(m.latest_batting)
Now, I didn't have every lib that you are using here, but this should give you an idea of how to do it.
Once again, my advice is that empty lines are your friends. They are the reader's friends for sure. Make your code 'breathe'.
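As a quick sanity check, here is a minimal sketch (my own illustration, reusing two of the hrefs printed in the question) showing that the same regex pulls out the 7-digit ids directly:

import re

def get_match_id(link):
    match_id = re.search(r'([0-9]{7})', link)
    return match_id.group() if match_id else None

# hrefs copied from the question's output
hrefs = [
    '8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/',
    '8890/scorecard/1118743/assam-vs-odisha-group-a-vijay-hazare-trophy-2017-18/',
]
print([get_match_id(h) for h in hrefs])  # expected: ['1118760', '1118743']

The 4-digit '8890' prefix is ignored because the regex only matches runs of exactly seven digits.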
Related
I am writing a script that will scrape a newsletter for URLs. There are some URLs in the newsletter that are irrelevant (e.g. links to articles, mailto links, social links, etc.). I added some logic to remove those links, but for some reason not all of them are being removed. Here is my code:
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []

for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

for link in termSheetLinks:
    if "fortune.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "forbes.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "twitter.com" in link in termSheetLinks:
        termSheetLinks.remove(link)

print(termSheetLinks)
When I ran it most recently, this was my output, despite trying to remove all links containing "fortune.com":
['https://fortune.com/company/blackstone-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://fortune.com/company/tpg?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://casproviders.org/asd-guidelines/', 'https://fortune.com/company/carlyle-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5', 'mailto:termsheet#fortune.com', 'https://www.afresh.com/', 'https://www.geopagos.com/', 'https://montana-renewables.com/', 'https://descarteslabs.com/', 'https://www.dealer-pay.com/', 'https://www.sequeldm.com/', 'https://pueblo-mechanical.com/', 'https://dealcloud.com/future-proof-your-firm/', 'https://apartmentdata.com/', 'https://www.irobot.com/', 'https://www.martin-bencher.com/', 'https://cell-matters.com/', 'https://www.lever.co/', 'https://www.sigulerguff.com/']
Any help would be greatly appreciated!
You do not need a regex here, in my opinion. Instead of removing the URLs, append only those that do not contain your substrings to the list, e.g. with a list comprehension:
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a") if not any(x in companyURL.get('href') for x in ["fortune.com","forbes.com","twitter.com"])]
Example
from bs4 import BeautifulSoup
import requests
termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")
myList = ["fortune.com","forbes.com","twitter.com"]
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a")
if not any(x in companyURL.get('href') for x in myList)]
Output
['https://casproviders.org/asd-guidelines/',
'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5',
'https://www.afresh.com/',
'https://www.geopagos.com/',
'https://montana-renewables.com/',
'https://descarteslabs.com/',
'https://www.dealer-pay.com/',
'https://www.sequeldm.com/',
'https://pueblo-mechanical.com/',
'https://dealcloud.com/future-proof-your-firm/',
'https://apartmentdata.com/',
'https://www.irobot.com/',
'https://www.martin-bencher.com/',
'https://cell-matters.com/',
'https://www.lever.co/',
'https://www.sigulerguff.com/']
Collecting the links to remove and deleting them after the for loop has finished will not skip any entries (removing items from a list while iterating over it is what causes some entries to be skipped).
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []

for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

lRemove = []
for link in termSheetLinks:
    if "fortune.com" in link:
        lRemove.append(link)
    if "forbes.com" in link:
        lRemove.append(link)
    if "twitter.com" in link:
        lRemove.append(link)

for l in lRemove:
    termSheetLinks.remove(l)

print(termSheetLinks)
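To see why the original approach skips entries, here is a minimal standalone sketch (hypothetical list values, not the newsletter data) showing how removing items while iterating moves the iterator past their neighbours:

links = ["a-fortune", "b-fortune", "c-other", "d-fortune"]

for link in links:
    if "fortune" in link:
        links.remove(link)

# "b-fortune" survives: after "a-fortune" is removed, the iterator
# advances past it, so it is never inspected.
print(links)  # ['b-fortune', 'c-other']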
Python code:
import string
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
<a href="...">Shareef Abdur-Rahim</a>
From the example above, I want to extract the player's name, which is 'Shareef Abdur-Rahim', but I want to do it for every entry in all of the tags_initial lists.
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe this could help you:
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.basketball-reference.com/players/'

# Alphabet
initial = list(string.ascii_lowercase)

datas = []

# URLS
urls = [url + i for i in initial]

for url in urls:
    # Soup Object
    soup = BeautifulSoup(urlopen(url), 'html.parser')

    # Players link
    url_links = soup.findAll("a", href=re.compile("players"))

    for link in url_links:
        # Player name
        datas.append(link.string)

print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
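For example, with BeautifulSoup (which the question is already using) the equivalent is simply reading the tag's text; a minimal sketch:

from bs4 import BeautifulSoup

html = "<a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a>"
tag = BeautifulSoup(html, "html.parser").find("a")
print(tag.get_text())  # Los Angeles Lakers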
I want to check if any of the excluded sites show up. I can get it to work with just one site, but as soon as I make it a list, it errors at if donts in thingy:
TypeError: 'in <string>' requires string as left operand, not tuple
This is my code:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if donts in thingy:
        pass
    else:
        print(thingy)
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if thingy in donts:
        print(thingy)
    else:
        pass
In short: test the string against the tuple (thingy in donts), not the tuple against the string.
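A tiny sketch of the difference (illustrative values only): in against a string is a substring test and requires a string on the left, while in against a tuple checks whether a whole element is present:

donts = ('stackoverflow.com', 'stackexchange.com')

print('stackoverflow.com' in donts)        # True  - element membership in a tuple
print('stack' in 'stackoverflow.com')      # True  - substring test on a string
# print(donts in 'stackoverflow.com')      # raises TypeError: 'in <string>' requires
#                                          # string as left operand, not tuple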
The crux of your problem is how you're searching your excluded list:
excluded = ("a", "b", "c")
links = ["a", "d", "e"]
for site in links:
if site not in excluded: # We want to know if the site is in the excluded list
print(f"Site not excluded: {site}")
Reverse the order of your elements and this should work fine. I've inverted your logic here so you can skip the unnecessary pass.
As a side note, this is one reason clear variable names can help - they will help you reason about what the logic should be doing. Especially in Python where ergonomics like in exist, this is very useful.
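For illustration, here is how the original loop might read with clearer names (a sketch only; the variable names are my own choice, the logic is unchanged):

import requests
from bs4 import BeautifulSoup

excluded_domains = ('stackoverflow.com', 'stackexchange.com')

response = requests.get("http://stackoverflow.com", timeout=6)
soup = BeautifulSoup(response.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    href = str(link.get('href'))
    if href not in excluded_domains:  # membership test against the tuple of excluded names
        print(href)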
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if any(d in thingy for d in donts):
        pass
    else:
        print(thingy)
The script used to work, but it no longer does and I can't figure out why. I am trying to go to the link and extract/print the religion field. Using Firebug, the religion field entry is inside the 'tbody' and then 'td' tag structure. But now the script finds "None" when searching for these tags, and when I look at the parsed document with print Soup_FamSearch, I can't see the 'tbody' and 'td' tags that appear in Firebug.
Please let me know what I am missing.
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import time
from unicodedata import normalize

FamSearchURL = 'https://familysearch.org/pal:/MM9.1.1/KH21-211'
OpenFamSearchURL = urllib2.urlopen(FamSearchURL)
Soup_FamSearch = BeautifulSoup(OpenFamSearchURL, 'lxml')
OpenFamSearchURL.close()

tbodyTags = Soup_FamSearch.find('tbody')
trTags = tbodyTags.find_all('tr', class_='result-item ')

for trTag in trTags:
    tdTags_label = trTag.find('td', class_='result-label ')
    if tdTags_label:
        tdTags_label_string = tdTags_label.get_text(strip=True)
        if tdTags_label_string == 'Religion: ':
            print trTag.find('td', class_='result-value ')
Find the Religion: label by text and get the next td sibling:
soup.find(text='Religion:').parent.find_next_sibling('td').get_text(strip=True)
Demo:
>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> response = requests.get('https://familysearch.org/pal:/MM9.1.1/KH21-211')
>>> soup = BeautifulSoup(response.content, 'lxml')
>>>
>>> soup.find(text='Religion:').parent.find_next_sibling('td').get_text(strip=True)
Methodist
Then, you can make a nice reusable function and reuse:
def get_field_value(soup, field):
    return soup.find(text='%s:' % field).parent.find_next_sibling('td').get_text(strip=True)

print get_field_value(soup, 'Religion')
print get_field_value(soup, 'Nationality')
print get_field_value(soup, 'Birthplace')
Prints:
Methodist
Canadian
Ontario
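If a label might be missing from the page, a slightly more defensive variant (my own addition, not part of the original answer; it reuses the soup from the demo above, and 'Occupation' is just a hypothetical field) can return None instead of raising an AttributeError:

def get_field_value_safe(soup, field):
    # Returns None when the '<field>:' label is not present on the page
    label = soup.find(text='%s:' % field)
    if label is None:
        return None
    value_cell = label.parent.find_next_sibling('td')
    return value_cell.get_text(strip=True) if value_cell else None

print get_field_value_safe(soup, 'Religion')    # Methodist
print get_field_value_safe(soup, 'Occupation')  # None if the label is absent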
I would like to estimate the impact of news on the Dow Jones quotes. For this, I wrote a Python HTML parser using the BeautifulSoup library. I extract an article and store it in an XML file for further analysis with the NLTK library. How can I increase the speed of parsing? The code below does the required task, but very slowly.
Here is the code of the HTML parser:
import urllib2
import re
import xml.etree.cElementTree as ET
import nltk
from bs4 import BeautifulSoup
from datetime import date
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from collections import defaultdict


def main_parser():
    # starting date
    a = date(2014, 3, 27)
    # ending date
    b = date(2014, 3, 27)

    articles = ET.Element("articles")
    f = open('~/Documents/test.xml', 'w')

    # loop through the links and per each link extract the text of the article, store the latter at xml file
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        # use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)

        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        # get text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


# getting the article text from the specific url
def extract_article(url):
    plain_text = ""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    tag = soup.find_all("p")
    # replace all html tags
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag))
    plain_text = plain_text.replace(", ,", "")
    return str(plain_text)


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Several fixes can be applied (without changing modules you are currently using):
use lxml parser instead of html5lib - it is much much (and 3 more muches) faster
parse only a part of document with SoupStrainer (note that html5lib doesn't support SoupStrainer - it will always parse the whole document slowly)
Here's how the code would look after the changes. A brief performance test shows at least a 3x improvement:
import urllib2
import xml.etree.cElementTree as ET
from datetime import date

from bs4 import SoupStrainer, BeautifulSoup
import nltk
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords


def main_parser():
    a = b = date(2014, 3, 27)
    articles = ET.Element("articles")
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime(
            "%d") + ".html"

        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if not 'video' in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


def extract_article(url):
    paragraphs = SoupStrainer('p')
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs)
    return soup.text


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Note that I've removed the regular expression processing from extract_article() - looks like you can just get the whole text from the p tags.
I might have introduced some problems - please check if everything is correct.
Another solution would be to use lxml for everything from parsing (replace beautifulSoup) to creating the xml (replace xml.etree.ElementTree).
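A minimal sketch of that direction (my own illustration, assuming the same Reuters archive URL layout as in the question), using lxml.html for parsing and lxml.etree for building the XML:

import urllib2
from lxml import html, etree

# Illustrative single date; the question builds one such URL per day
url = "http://www.reuters.com/resources/archive/us/20140327.html"
doc = html.fromstring(urllib2.urlopen(url).read())

articles = etree.Element("articles")
for anchor in doc.xpath('//div[@class="headlineMed"]/a'):
    if 'video' in anchor.get('href', ''):
        continue
    article = etree.SubElement(articles, "article")
    etree.SubElement(article, "article_name").text = anchor.text_content()
    etree.SubElement(article, "article_link").text = anchor.get('href')

print etree.tostring(articles, pretty_print=True)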
Another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework.
It is simple and very fast, and all the batteries you can imagine are included: link extractors, XML exporters, database pipelines, etc. Worth looking at; a bare-bones example is sketched below.
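As a rough idea of what that looks like (a sketch only; the spider name is made up, and the selectors mirror the question's headlineMed divs):

import scrapy


class ReutersArchiveSpider(scrapy.Spider):
    # Hypothetical spider, scraping one archive page from the question
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        for anchor in response.css("div.headlineMed a"):
            href = anchor.attrib.get("href", "")
            if "video" in href:
                continue
            yield {
                "article_name": anchor.css("::text").get(),
                "article_link": href,
            }

Running it with scrapy runspider and -o articles.xml would use the built-in XML feed exporter instead of hand-rolled ElementTree code.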
Hope that helps.
You want to pick the best parser.
We benchmarked most of the parsers / platforms when building http://serpapi.com
Here is a full article on Medium:
https://medium.com/#vikoky/fastest-html-parser-available-now-f677a68b81dd
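For instance, a quick way to compare parsers on your own pages is to time BeautifulSoup with each backend (a rough sketch; it assumes lxml and html5lib are installed, and the absolute numbers will vary by page and machine):

import time
import requests
from bs4 import BeautifulSoup

# Any representative page works; this reuses the archive URL from the question
html = requests.get("http://www.reuters.com/resources/archive/us/20140327.html").text

for parser in ("html.parser", "lxml", "html5lib"):
    start = time.time()
    BeautifulSoup(html, parser)
    print("{}: {:.3f} s".format(parser, time.time() - start))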