BS4 HTTP request gives an error when I edit the URL in code - Python

Trigger warning: I'm a noob
import requests
from bs4 import BeautifulSoup
from termcolor import colored
with open('ign.txt') as f:
    namesList = f.readlines()

print("Accounts found: ", namesList)  # Opening file and reading it

for x in namesList:
    url = "https://oldschool.runeclan.com/user/" + x  # Adding username from file to URL
    print(url)
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find(class_="xp_tracker_gain_today").get_text()
    title2 = soup.find(class_="xp_tracker_gain altcolor xp_tracker_pos").get_text()
    title3 = soup.find(class_="xp_tracker_next").get_text()  # Finding right information on site
    print(colored("Exp gain today: " + title, 'green'))
    print(colored("Exp gain yesterday: " + title2, 'green'))
    print(colored(title3, 'green'))  # Printing data found
When I build the URL with url = "https://oldschool.runeclan.com/user/" + x
I get the following error message:
AttributeError: 'NoneType' object has no attribute 'get_text'
which should mean that nothing was found.
This is the output from the upper half of the code:
Accounts found:  ['mausie\n', 'mr+stevieyh\n', 'Douwe\n', 'Henk\n']
https://oldschool.runeclan.com/user/mausie
So the link I built looks correct.
When I don't modify the URL and instead use, say, url = "https://oldschool.runeclan.com/user/myusername"
it gives no error. However, I want to loop through my file to check more than one username.
Does anyone know how to fix this?

Here is the problem: when you read from the text file, an extra \n (newline) gets added to your URL. That is why requests gets back a 404 "page not found" page and soup.find() then returns None. A good way to check is to use print(repr(url)) instead of print(url); that makes the trailing '\n' visible. To fix it we just do url = url.rstrip(), and voilà, it works.
import requests
from bs4 import BeautifulSoup
from termcolor import colored
with open('Sample', encoding='utf-8-sig') as f:
    namesList = f.readlines()

print("Accounts found: ", namesList)  # Opening file and reading it

for x in namesList:
    url = "https://oldschool.runeclan.com/user/" + x  # Adding username from file to URL
    print(repr(url))
    url = url.rstrip()  # Remove the trailing newline before making the request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find(class_="xp_tracker_gain_today").text
    title2 = soup.find(class_="xp_tracker_gain altcolor xp_tracker_pos").text
    title3 = soup.find(class_="xp_tracker_next").text  # Finding right information on site
    print(colored("Exp gain today: " + title, 'green'))
    print(colored("Exp gain yesterday: " + title2, 'green'))
    print(colored(title3, 'green'))  # Printing data found
Output:
Accounts found: ['mausie\n', 'mr+stevieyh\n']
'https://oldschool.runeclan.com/user/mausie\n'
Exp gain today: 233,508
Exp gain yesterday: 469,011
Last tracked: 1 minute agoNext track available: now
'https://oldschool.runeclan.com/user/mr+stevieyh\n'
Exp gain today: 129,203
Exp gain yesterday: 730,434
Last tracked: 1 minute agoNext track available: now
That extra \n was the problem all along.
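As a side note, the same effect can be achieved by stripping the newline at read time, so every later use of the name is already clean; a minimal sketch reusing the file and variable names from the question:

# Strip the trailing newline from each line while reading the file,
# so no rstrip() is needed when building the URL later.
with open('ign.txt') as f:
    namesList = [line.strip() for line in f]
print("Accounts found: ", namesList)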

Related

Why am I not seeing any results in my output from extracting indeed data using python

I am trying to run this code in IDLE 3.10.6 and I am not seeing any of the data that should be extracted from Indeed. All of that data should be in the output when I run it, but it isn't. Below is the code:
# Indeed data
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko"}
    url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup

def transform(soup):
    divs = soup.find_all("div", class_="jobsearch-SerpJobCard")
    for item in divs:
        title = item.find("a").text.strip()
        company = item.find("span", class_="company").text.strip()
        try:
            salary = item.find("span", class_="salarytext").text.strip()
        finally:
            salary = ""
        summary = item.find("div", {"class": "summary"}).text.strip().replace("\n", "")
        job = {
            "title": title,
            "company": company,
            'salary': salary,
            "summary": summary
        }
        joblist.append(job)

joblist = []
for i in range(0, 40, 10):
    print(f'Getting page, {i}')
    c = extract(10)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
Here is the output I get
Getting page, 0
Getting page, 10
Getting page, 20
Getting page, 30
Empty DataFrame
Columns: []
Index: []
Why is this happening, and what should I do to get the extracted data from Indeed? What I am trying to get is the job title, company, salary, and summary information. Any help would be greatly appreciated.
The URL string includes {page}, but it's not an f-string, so it's not being interpolated, and the URL you are fetching is literally:
https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}
That returns an error page.
So you should add an f before the opening quote when you set url.
Also, you are calling extract(10) on every iteration instead of extract(i).
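Putting both fixes together, the relevant parts would look roughly like this (a sketch of the changed lines only; note that headers must also be passed as a keyword argument, since the second positional argument of requests.get is params, not headers):

def extract(page):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}
    # f-string so {page} is actually interpolated into the URL
    url = f"https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
    r = requests.get(url, headers=headers)  # headers as a keyword argument
    return BeautifulSoup(r.content, "html.parser")

for i in range(0, 40, 10):
    print(f'Getting page, {i}')
    c = extract(i)  # pass the loop variable, not the constant 10
    transform(c)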
This is the correct way of building the url:
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}".format(page=page)
r = requests.get(url,headers)
Here r.status_code gives 403, which means the request is forbidden: the site is blocking the request from completing. Consider using the Indeed job search API instead.

Web scraping review section on agoda website with post request

I need help retrieving the review section from the Agoda website.
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm

filename = "hotel.csv"
f = open(filename, "w", encoding="utf-8")
headers = "title, rating, review\n"
f.write(headers)

api_url = "https://www.agoda.com/api/cronos/property/review/ReviewComments"
headers = {
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

# for loop for multiple page scrape
for x in tqdm(range(1, 10)):
    post_data = {"hotelId": "2252947",
                 "providerId": "332",
                 "demographicId": "0",
                 "page": str(x),
                 "pageSize": "20",
                 "sorting": "7",
                 "providerIds": [332],
                 "isReviewPage": "false",
                 "isCrawlablePage": "true",
                 "filters": {"language": [], "room": []},
                 "searchKeyword": "",
                 "searchFilters": []}
    html = requests.post(api_url, data=post_data)
    values = html.text
    soup = BeautifulSoup(values, "html.parser")
    hotels = soup.find_all("div", {"class": "review-comment"})
    for hotel in hotels:
        try:
            rating = hotel.find("div", {"class": "Review-comment-leftScore"}).text
            title = hotel.find("p", {"class": "Review-comment-bodyTitle"}).text
            review = hotel.find("p", {"class": "Review-comment-bodyText"}).text
            f.write(title + ", " + rating + ", " + review + "\n")
        except TypeError:
            continue

f.close()
The post data is what I get from the Firefox network monitor when I change the page in the review section.
The hotel: Hotel Page
I tried the JSON method but I don't understand it.
I think your API endpoint or data is wrong, because if you just print the response you get <Response [415]>.
It should be 200.
html.json()
{'type': 'https://tools.ietf.org/html/rfc7231#section-6.5.13', 'title': 'Unsupported Media Type', 'status': 415, 'traceId': '00-68f23e7f0431e7bffae420112667ed1b-6306a38dd716894d-00'}
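For what it's worth, a 415 response means the server rejected the request's Content-Type. Since the browser sends this payload as JSON, one thing worth trying (unverified here) is posting the dictionary with requests' json= parameter, which serializes it and sets Content-Type: application/json automatically:

# Hypothetical fix: send the payload as JSON instead of form-encoded data.
html = requests.post(api_url, json=post_data)
print(html.status_code)  # check whether the endpoint now returns 200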

How to extract movie genre from Metacritic website using BeautifulSoup

I want to do this for the top 500 movies on Metacritic, found at https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc
Each genre will be extracted from a details link like this (for the first movie): https://www.metacritic.com/movie/citizen-kane-1941/details
I just need some help extracting the genre from the HTML of the details link above.
My get_genre function (but I get an AttributeError):
def get_genre(detail_link):
    detail_page = requests.get(detail_link, headers=headers)
    detail_soup = BeautifulSoup(detail_page.content, "html.parser")
    try:
        #time.sleep(1)
        table = detail_soup.find('table', class_='details', summary=movie_name + " Details and Credits")
        #print(table)
        gen_line1 = table.find('tr', class_='genres')
        #print(gen_line1)
        gen_line = gen_line1.find('td', class_='data')
        #print(gen_line)
    except:
        time.sleep(1)
        year = detail_soup.find(class_='release_date')
        year = year.findAll('span')[-1]
        year = year.get_text()
        year = year.split()[-1]
        table = detail_soup.find('table', class_='details', summary=movie_name + " (" + year + ")" + " Details and Credits")
        #print(table)
        gen_line1 = table.find('tr', class_='genres')
        #print(gen_line1)
        gen_line = gen_line1.find('td', class_='data')
    genres = []
    for line in gen_line:
        genre = gen_line.get_text()
        genres.append(genre.strip())
    genres = list(set(genres))
    genres = (str(genres).split())
    return genres
You're focusing too much on getting the table. Just use the elements you're sure about. Here's an example with select:
from bs4 import BeautifulSoup
import requests
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_0) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/58.0.849.0 Safari/536.1'}
detail_link="https://www.metacritic.com/movie/citizen-kane-1941/details"
detail_page = requests.get(detail_link, headers = headers)
detail_soup = BeautifulSoup(detail_page.content, "html.parser")
genres=detail_soup.select('tr.genres td.data span')
print([genre.text for genre in genres])
>>> ['Drama', 'Mystery']
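To plug this back into the original loop over the top-500 list, the select call can be wrapped in a small helper; a sketch assuming headers is defined as in the question and detail_link is the details URL of one movie:

def get_genre(detail_link):
    # Fetch the details page and read the genre spans directly,
    # without depending on the table's summary attribute.
    detail_page = requests.get(detail_link, headers=headers)
    detail_soup = BeautifulSoup(detail_page.content, "html.parser")
    genres = detail_soup.select('tr.genres td.data span')
    return [genre.text.strip() for genre in genres]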

Python Youtube Web Scraper not working properly

So I built this small script that would give back the URL of any searched video on YouTube. But after opening it up again, it turns out the scraping of YouTube is not working properly: when printing soup, it returns something completely different from what can be seen with Inspect Element on YouTube. Can someone help me solve this?
Here's my code:
import requests
from lxml import html
import webbrowser
from bs4 import BeautifulSoup
import time
import tkinter
from pytube import YouTube

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"}

def video_finder():
    word = input("Enter video title: ")
    if ' ' in word:
        new = word.replace(' ', '+')
        print(new)
    else:
        pass
    vid = requests.get('https://www.youtube.com/results?search_query={}'.format(new))
    soup = BeautifulSoup(vid.text, features='lxml')
    all_vids = soup.find_all('div', id_='contents')
    print(all_vids)
    video1st = all_vids[0]
    a_Tag = video1st.find('a', class_="yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link", href=True)
    Video_name = a_Tag.text
    Video_id = a_Tag['href']
    video_link = 'https://www.youtube.com' + Video_id
    print(Video_name)
    print(video_link)
It's not the best, but yeah... thank you.
To get the correct result from the YouTube page, set the User-Agent HTTP header to Googlebot, and use html.parser in BeautifulSoup.
For example:
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}

def video_finder():
    word = input("Enter video title: ")
    params = {
        'search_query': word
    }
    vid = requests.get('https://www.youtube.com/results', params=params, headers=headers)
    soup = BeautifulSoup(vid.content, features='html.parser')
    a_Tag = soup.find('a', class_="yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link", href=lambda h: h.startswith('/watch?'))
    Video_name = a_Tag.text
    Video_id = a_Tag['href']
    video_link = 'https://www.youtube.com' + Video_id
    print(Video_name)
    print(video_link)

video_finder()
Prints:
Enter video title: sailor moon
Sailor Moon Opening (English) *HD*
https://www.youtube.com/watch?v=5txHGxJRwtQ

Want to scrape data using 10 different keywords in URL for 2 pages and write scraped data to csv using Python 3.6.2 and BS4

I have the code ready for one keyword and it's working fine. The next problem is that I want to do the scrape for 10 different keywords and save the results in one CSV file, with the keyword name in its own column/row. I think we can give a CSV file as input so it picks the keywords one by one and scrapes each. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
# excluding page from base_url for further adding
res = []
for page in range(1, 3):
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding page
    if request.status_code == 404:  # added just in case of error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_='s-result-item'):
        res.append([url.get('data-asin'), url.get('id')])

df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords; replace them with the ones you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"
keywords_list = ['helmets for men', 'helmets for women']
keyword = 'helmets for men'
# excluding page from base_url for further adding
res = []
for page in range(1, 3):
    for keyword in keywords_list:
        request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # here adding keyword and page
        if request.status_code == 404:  # added just in case of error
            break
        soup = BeautifulSoup(request.content, "lxml")
        for url in soup.find_all('li', class_='s-result-item'):
            res.append([url.get('data-asin'), url.get('id'), keyword])

df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')
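If the keywords should come from a file rather than a hard-coded list, as the question suggests, they can be read in before the loop; a sketch assuming a hypothetical keywords.csv with one keyword per line:

import csv

# Hypothetical input file 'keywords.csv', one keyword per line, e.g.
#   helmets for men
#   helmets for women
with open('keywords.csv', newline='') as f:
    keywords_list = [row[0] for row in csv.reader(f) if row]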
