create list of tuples with download url + "href" - python

I'm trying to make a list of tuples where the first element is the download URL and the second is the file name taken from the URL string, using the code below:
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
url = r"https://www.ers.usda.gov/data-products/livestock-meat-domestic-data"
my_bytes = urllib.request.urlopen(url)
my_bytes = my_bytes.read().decode("utf8")
parsed_html = BeautifulSoup(my_bytes, features = "lxml")
table_data = parsed_html.body.find('table', attrs = {'id':'data_table'})
download_url = "https://www.ers.usda.gov"
full_download_url = [tuple(download_url,i["href"]) for i in table_data.find_all('a')]
But I keep getting TypeError: must be str, not list, and I'm not sure how to fix it. Please help. Thanks!

This is what I needed:
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
url = r"https://www.ers.usda.gov/data-products/livestock-meat-domestic-data"
my_bytes = urllib.request.urlopen(url)
my_bytes = my_bytes.read().decode("utf8")
parsed_html = BeautifulSoup(my_bytes, features = "lxml")
table_data = parsed_html.body.find('table', attrs = {'id':'data_table'})
download_url = "https://www.ers.usda.gov"
def convertTuple(tup):
    result = ''  # renamed from "str" to avoid shadowing the builtin
    for item in tup:
        result = result + item
    return result

# note: tuple(...) splits the string into characters and convertTuple joins them
# back, so download_url + i["href"] alone would give the same string
full_download_url = [convertTuple(tuple(download_url + i["href"])) for i in table_data.find_all('a')]
Thanks to GeeksforGeeks and everyone trying to help :)
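Side note: if you actually need the (download URL, file name) pairs described at the top of the question, here is a minimal sketch of one way to build them (assuming each href path ends with the file name; posixpath handles the URL-style forward slashes):

import posixpath

# Pair each full download URL with the file name at the end of its href
url_name_pairs = [
    (download_url + a["href"], posixpath.basename(a["href"]))
    for a in table_data.find_all('a')
]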

The real problem is the call tuple(download_url, i["href"]): tuple() accepts a single iterable argument, not two separate strings, so Python raises a TypeError.
You don't need the tuple() constructor at all here; a tuple literal does what you want:
full_download_url = [(download_url, i["href"]) for i in table_data.find_all('a')]

Related

Python: Get element next to href

Python code:
import string
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
<a href="...">Shareef Abdur-Rahim</a>
From the example above, I want to extract the player's name, 'Shareef Abdur-Rahim', but I want to do it for all of the tags_initial lists.
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe this could help you:
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.basketball-reference.com/players/'

# Alphabet
initial = list(string.ascii_lowercase)
datas = []

# URLs
urls = [url + i for i in initial]

for url in urls:
    # Soup object
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    # Player links
    url_links = soup.findAll("a", href=re.compile("players"))
    for link in url_links:
        # Player name
        datas.append(link.string)

print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
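With BeautifulSoup, for instance, the text of a tag is available directly; a small self-contained sketch:

from bs4 import BeautifulSoup

html = '<a href="/teams/LAL/2021.html">Los Angeles Lakers</a>'
tag = BeautifulSoup(html, 'html.parser').find('a')
print(tag.get_text())  # Los Angeles Lakers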

How can I get a random object from a list in Python

I have built a list which contains hrefs from a website and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')

def getItems():
    for a in soup.findAll('a',attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it picks a random character from each href instead of one random link from the whole list.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')

def getItems():
    for a in soup.findAll('a',attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
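Equivalently, a minimal sketch of the same idea with a list comprehension (assuming soup has already been built as above):

# Collect all matching hrefs, then pick one at random
links = [a['href'] for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")})]
if links:
    print(random.choice(links))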

if string contains from list

I want to check if any of the excluded sites show up. I can get it to work with just one site, but as soon as I make it a list, it errors at if donts in thingy:
TypeError: 'in <string>' requires string as left operand, not tuple
This is my code:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if donts in thingy:
        pass
    else:
        print(thingy)
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if thingy in donts:
        print(thingy)
    else:
        pass
The key point: test string in tuple, not tuple in string.
The crux of your problem is how you're searching your excluded list:
excluded = ("a", "b", "c")
links = ["a", "d", "e"]

for site in links:
    if site not in excluded:  # We want to know if the site is in the excluded list
        print(f"Site not excluded: {site}")
Reverse the order of your elements and this should work fine. I've inverted your logic here so you can skip the unnecessary pass.
As a side note, this is one reason clear variable names can help - they will help you reason about what the logic should be doing. Especially in Python where ergonomics like in exist, this is very useful.
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if any(d in thingy for d in donts):
        pass
    else:
        print(thingy)
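As a side note, substring tests like d in thingy can over-match (a URL that merely mentions the domain in its path would be skipped too). A more precise sketch, comparing hostnames with urllib.parse (my own variation, not from the answers above):

from urllib.parse import urlparse

donts = ('stackoverflow.com', 'stackexchange.com')

def is_excluded(href):
    # Compare the hostname itself, not the raw URL string
    host = urlparse(href).netloc
    return any(host == d or host.endswith('.' + d) for d in donts)

print(is_excluded('http://stackoverflow.com/questions'))    # True
print(is_excluded('http://example.com/stackoverflow.com'))  # False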

How can I parse querystring from webpage?

I'm trying to parse all the query strings present in a page so that, using those query strings, I can navigate to a specific page. The code I tried is below:
import requests
from bs4 import BeautifulSoup
import datetime
import dateutil.parser
import time
import pytz

"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo"""
from espncricinfo.match import Match
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError

"""----time-zone-calculation----"""
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

"""------URL of page to parse-------with a date of today-----"""
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today
"""eg. url = http://www.espncricinfo.com/ci/engine/match/index.html?date=2018-02-12"""

r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

"""------parsing for matchno------"""
match_no = [x['href'].split('/', 4)[4].split('.')[0] for x in
            soup.findAll('a', href=True, text='Scorecard')]

for p in match_no:
    """where p is a match no, e.g. p = '1122282'"""
    m = Match(p)
    m.latest_batting
    print(m.latest_batting)
When I print match_no I get this output:
['8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118743/assam-vs-odisha-group-a-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118745/bengal-vs-delhi-group-b-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118763/chhattisgarh-vs-vidarbha-group-d-vijay-hazare-trophy-2017-18/']
This page (http://www.espncricinfo.com/ci/engine/match/index.html?date=datestring_today) contains all the match_no entries for games happening on that day. I want to trim each entry down to the 7-digit match number [1118760, 1118743, 1118745, ...]. How can I do this? Then, using that match_no, I can pass it to Match() to get the details of a particular match happening on that day.
PS: if no match is going on that day, then match_no comes back empty.
First, your code is very hard to read. You need to let your code breathe and make it appealing for others to read it.
Second, what is causing the issue is probably this line:
match_no = [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]
It is hard to read too. There are far better, more readable ways of parsing the match id from a URL.
Here is an example of what should work. I took a provisional date for the matches:
import re
import pytz
import requests
import datetime
from bs4 import BeautifulSoup

"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo"""
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
from espncricinfo.match import Match

def get_match_id(link):
    match_id = re.search(r'([0-9]{7})', link)
    if match_id is None:
        return None
    return match_id.group()

# ----time-zone-calculation----
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

# ------URL of page to parse-------with a date of today-----
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today

r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

spans = soup.findAll('span', {"class": "match-no"})

matches_ids = []
for s in spans:
    # guard against anchors with no href before doing the substring test
    for a in s.findAll('a', href=lambda href: href and 'scorecard' in href):
        match_id = get_match_id(a['href'])
        if match_id is None:
            continue
        matches_ids.append(match_id)

# ------parsing for matchno------
for p in matches_ids:
    # where p is a match no, e.g. p = '1122282'
    m = Match(p)
    print(m.latest_batting)
Now, I didn't have every lib that you are using here, but this should give you an idea of how to do it.
Once again, my advice is that empty lines are your friends. They are reader's friends for sure. Make your code 'breathe'.
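To illustrate, a quick check of that regex against one of the hrefs from the output above:

import re

link = '8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/'
print(re.search(r'([0-9]{7})', link).group())  # 1118760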

Request an HTML file

This link lets me get a random item from the database. However, I would like to retrieve items automatically using Python. Here's my code:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen
# parameters
data = {}
data["query"] = "reviewd:yes+AND+organism:9606"
data["random"] = "yes"
url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values
data = urlopen(full_url)
out = open("1.html", 'w')
out.write(str(data.read()))
However, I cannot get the desired page. Does anyone know what's wrong with my code? I'm using Python 3.x.
You have several issues:
reviewd is misspelled, it should be reviewed
The base url needs to have /uniprot/ at the end
You need to use space instead of + in your query string
Here is what that would look like:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen

# parameters
data = {}
data["query"] = "reviewed:yes AND organism:9606"
data["random"] = "yes"

url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values

data = urlopen(full_url)
# decode the response bytes instead of str()-ing them, and close the file when done
with open("1.html", 'w') as out:
    out.write(data.read().decode('utf-8'))
This produces the following URL:
http://www.uniprot.org/uniprot/?query=reviewed%3Ayes+AND+organism%3A9606&random=yes
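A quick way to confirm that encoding behavior (urlencode percent-escapes the colons and turns the spaces into +):

from urllib.parse import urlencode

params = {"query": "reviewed:yes AND organism:9606", "random": "yes"}
print(urlencode(params))
# query=reviewed%3Ayes+AND+organism%3A9606&random=yes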
