I have the following code (I have X'd out my username and password). The code runs fine; however, how do I know if it's using the proxy IP address to complete this scrape? I am teaching myself from scratch how to do this, so any help is appreciated, as I know there is so much to learn!
import requests
from bs4 import BeautifulSoup
from random import choice
import pandas as pd

def start_requests(self):
    for url in self.start_urls:
        return Request(url=url, callback=self.parse,
                       headers={"User-Agent": "My user agent"},
                       meta={"proxy": "http://username:/password#p.proxyegg.com:8080"})

url_list = []

def getProduct_Data(tag):
    url = f'https://www.whiteline.com.au/product_detail4.php?part_number={tag}'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    Product_Data = {
        'sku': soup.find("div", {"class": "head2BR"}).text,
        'product': soup.find("div", {"style": "float:left; margin-left:24px; margin-top:8px; margin-right:14px;"}).text,
        'description': soup.find("div", {"style": "min-height:80px; max-height:224px; overflow-y: scroll;"}).text.strip(),
        'price': soup.find("div", {"class": "float_Q"}).text.strip(),
        'features': soup.find("div", {"class": "grey1"}).text.strip(),
        'contents': soup.find("div", {"style": "max-height:28px; overflow-y:scroll;"}).text.strip(),
        'compatiblity': soup.find("div", {"style": "width:960px; margin:auto; padding-top:18px;"}).text.strip(),
    }
    url_list.append(Product_Data)
    return

getProduct_Data('KBR15')
getProduct_Data('W13374')
getProduct_Data('BMR98')
getProduct_Data('W51210')
getProduct_Data('W51211')
getProduct_Data('W92498')
getProduct_Data('W93404')
getProduct_Data('W92899')
getProduct_Data('W51710')
getProduct_Data('W53277')
getProduct_Data('W53379')
getProduct_Data('BSK010M')
getProduct_Data('KSB568')

df = pd.DataFrame(url_list)
df.to_csv('whitelinefull.csv')
print('Fin.')
One way to test the efficacy of your proxy is to scrape an IP lookup/location site and check that the IP the site detects via the request is the same as the one your proxy is using:
import requests, re
from bs4 import BeautifulSoup as soup

# Assumes proxies/headers along these lines (placeholders, not real values):
# proxies = {'https': 'http://user:pass@12.34.56.78:8080'}
# headers = {'User-Agent': 'Mozilla/5.0'}
d = soup(requests.get('https://ip.me/', proxies=proxies, headers=headers).text, 'html.parser')
assert d.select_one('input[name="ip"]')['value'] == re.findall(r'(?:\d+\.)+\d+', proxies['https'])[0]
The detected IP from https://ip.me/ can be found in d.select_one('input[name="ip"]')['value'].
It may be that instead of an explicit IP in your proxies dictionary, a non-IP URL with login credentials is used (which appears to be the case from your post). To check that the proxy provider is working correctly, you can send a request to https://ip.me/ both with and without the proxies and compare the results:
d = soup(requests.get('https://ip.me/', proxies=proxies, headers=headers).text, 'html.parser')
d1 = soup(requests.get('https://ip.me/', headers=headers).text, 'html.parser')
# If the proxy is working, the two detected IPs should differ.
assert d.select_one('input[name="ip"]')['value'] != d1.select_one('input[name="ip"]')['value'], "proxy does not seem to be working"
Based on my question here, I have a further question about requests on the website finance.yahoo.com.
My request without a User-Agent header gives me the HTML code from which I want to collect some data.
The call with 'ALB' as the parameter works fine; I get the requested data:
import bs4 as bs
import requests

def yahoo_summary_stats(stock):
    response = requests.get(f"https://finance.yahoo.com/quote/{stock}")
    # response = requests.get(f"https://finance.yahoo.com/quote/{stock}", headers={'User-Agent': 'Custom user agent'})
    soup = bs.BeautifulSoup(response.text, 'lxml')
    table = soup.find('p', {'class': 'D(ib) Va(t)'})
    sector = table.findAll('span')[1].text
    industry = table.findAll('span')[3].text
    print(f"{stock}: {sector}, {industry}")
    return sector, industry

yahoo_summary_stats('ALB')
Output:
ALB: Basic Materials, Specialty Chemicals
The call yahoo_summary_stats('AEE') doesn't work this way, so I need to activate headers to request the site successfully.
But now, with the parameter headers={'User-Agent': 'Custom user agent'}, the code doesn't work and cannot find the paragraph p with class 'D(ib) Va(t)'.
How can I solve this problem?
I think you are fetching the wrong URL:
response = requests.get(f"https://finance.yahoo.com/quote/{stock}/profile?p={stock}", headers={'User-Agent': 'Custom user agent'})
Changing to the URL above, along with the user agent, should solve it.
This page uses JavaScript to display the information, but requests/BeautifulSoup can't run JavaScript.
However, checking the page in a web browser with JavaScript disabled, I can see this information on the Profile subpage:
"https://finance.yahoo.com/quote/{stock}/profile?p={stock}"
The code can get it for both stocks from this page, but it needs a User-Agent header from a real browser (or at least the short version 'Mozilla/5.0').
import bs4 as bs
import requests

def yahoo_summary_stats(stock):
    url = f"https://finance.yahoo.com/quote/{stock}/profile?p={stock}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    print('url:', url)
    response = requests.get(url, headers=headers)
    soup = bs.BeautifulSoup(response.text, 'lxml')
    table = soup.find('p', {'class': 'D(ib) Va(t)'})
    sector = table.findAll('span')[1].text
    industry = table.findAll('span')[3].text
    print(f"{stock}: {sector}, {industry}")
    return sector, industry
# --- main ---
result = yahoo_summary_stats('ALB')
print('result:', result)
result = yahoo_summary_stats('AEE')
print('result:', result)
Result:
url: https://finance.yahoo.com/quote/ALB/profile?p=ALB
ALB: Basic Materials, Specialty Chemicals
result: ('Basic Materials', 'Specialty Chemicals')
url: https://finance.yahoo.com/quote/AEE/profile?p=AEE
AEE: Utilities, Utilities—Regulated Electric
result: ('Utilities', 'Utilities—Regulated Electric')
I am going to extract posts from a forum named "positive wellbeing during isolation" on HealthUnlocked.com.
I can extract posts without logging in, but I cannot extract the posts that require a login. I used url = 'https://solaris.healthunlocked.com/posts/positivewellbeing/popular?pageNumber={0}'.format(page) to extract posts, but I don't know how to connect this to the login, as the URL returns JSON.
I would appreciate it if you could help me.
import requests, json
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

url = "https://healthunlocked.com/private/programs/subscribed?user-id=1290456"
payload = {
    "username": "my username goes here",
    "Password": "my password goes here"
}
s = requests.Session()
p = s.post(url, data=payload)

headers = {"user-agent": "Mozilla/5.0"}
pages = 2
data = []
listtitles = []
listpost = []
listreplies = []
listpostID = []
listauthorID = []
listauthorName = []

for page in range(1, pages):
    url = 'https://solaris.healthunlocked.com/posts/positivewellbeing/popular?pageNumber={0}'.format(page)
    r = requests.get(url, headers=headers)
    posts = json.loads(r.text)
    for post in posts:
        sleep(3.5)
        listtitles.append(post['title'])
        listreplies.append(post["totalResponses"])
        listpostID.append(post["postId"])
        listauthorID.append(post["author"]["userId"])
        listauthorName.append(post["author"]["username"])
        url = 'https://healthunlocked.com/positivewellbeing/posts/{0}'.format(post['postId'])
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        listpost.append(soup.select_one('div.post-body').get_text('|', strip=True))

## save to CSV
df = pd.DataFrame(list(zip(*[listpostID, listtitles, listpost, listreplies, listauthorID, listauthorName]))).add_prefix('Col')
df.to_csv('out1.csv', index=False)
print(df)
sleep(2)
For most websites, you first have to get a token by logging in; most of the time, this is a cookie. Then, in authorized requests, you send that cookie along. Open the network tab in developer tools, log in with your username and password, and you'll be able to see how the login request is formatted and where it is sent. From there, just replicate it in your code.
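As a rough sketch of that approach (the login endpoint, field names, and private-page URL below are hypothetical placeholders; copy the real ones from the network tab):

import requests

# Hypothetical values; replace them with what the network tab shows for your site.
LOGIN_URL = "https://example.com/api/login"
payload = {"username": "my username goes here", "password": "my password goes here"}

with requests.Session() as s:
    # The session stores whatever cookie/token the login response sets.
    r = s.post(LOGIN_URL, data=payload)
    r.raise_for_status()
    print(s.cookies.get_dict())  # inspect the cookie(s) the site issued

    # Later requests through the same session send those cookies automatically.
    r = s.get("https://example.com/private/posts")
    print(r.status_code)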
So I need to log in to a website, as I need to do an action that requires logging in first.
Here's my code:
import requests
from bs4 import BeautifulSoup

logdata = {'username': 'xxx', 'password': 'xxx'}
url = 'https://darkside-ro.com/?module=account&action=login&return_url='

with requests.Session() as s:
    r = s.post(url, data=logdata)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    print(soup.title.get_text())
It gives me the title of the page you see when you're not logged in :(
I'm not sure why I flagged this as a duplicate, sorry.
Okay, so I created a dummy account and tried logging in. I noticed that when I submit the form, the username, password, and a hidden server field are sent to https://darkside-ro.com/?module=account&action=login&return_url=.
So to fix your issue, you have to include a server in your logdata dictionary.
import requests
from bs4 import BeautifulSoup

logdata = {
    'username': 'abc123456',
    'password': 'abc123456',
    'server': 'Darkside RO'
}
url = 'https://darkside-ro.com/?module=account&action=login&return_url='

with requests.Session() as s:
    r = s.post(url, data=logdata)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.title.get_text())
Running the code above will print
Darkside RO - The Rise of Skywalker
PS: When you do things like this again, it would be a good idea to check for hidden inputs in the form by inspecting the elements (see the sketch after this snippet). On the site above, the form has
<input type="hidden" name="server" value="Darkside RO">
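If you don't want to hunt for hidden fields by hand, here is a minimal sketch (assuming the login inputs are the only named inputs on the page) that collects every named input, hidden ones included, and then overwrites the visible credentials:

import requests
from bs4 import BeautifulSoup

url = 'https://darkside-ro.com/?module=account&action=login&return_url='

with requests.Session() as s:
    page = BeautifulSoup(s.get(url).text, 'html.parser')
    # Seed the payload with every named input, then fill in the credentials.
    logdata = {i['name']: i.get('value', '') for i in page.select('input[name]')}
    logdata['username'] = 'abc123456'
    logdata['password'] = 'abc123456'
    r = s.post(url, data=logdata)
    print(BeautifulSoup(r.text, 'html.parser').title.get_text())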
I have to scrape data from a website that requires login.
This is the current code I am using, but I am not getting the logged-in page's HTML.
from requests import Session
from bs4 import BeautifulSoup as bs

with Session() as s:
    site = s.get("https://www.valueresearchonline.com/membership/getin.asp?ref=%2Fport_v1%2Fdefault%2Easp%3Fselv%3D8%26poid%3D1443091")
    bs_content = bs(site.content, "html.parser")
    token = bs_content.find("input", {"name": "ref"})["value"]
    login_data = {"username": "<username>", "password": "<password>", "ref": token}
    p = s.post("https://www.valueresearchonline.com/membership/getin.asp?ref=%2Fport_v1%2Fdefault%2Easp%3Fselv%3D8%26poid%3D1443091", login_data)
    print(p.text)
The HTML I am getting is the same as before logging in. Also, I am not sure whether the token part is required for this site, so I have tried it both with and without the token, but in both cases the result was the same as I explained.
Add one more parameter, allow_redirects=True, to the s.post(...) call, and change the URL to https://www.valueresearchonline.com/registration/loginprocess.asp:
p = s.post("https://www.valueresearchonline.com/registration/loginprocess.asp", data=login_data, allow_redirects=True)
Check if this works for you.
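As a quick sanity check (the "logout" marker here is just an assumption; use any text that appears only when you are signed in), you can verify that the login actually took effect:

# Hypothetical marker check; pick a string unique to the logged-in page.
if "logout" in p.text.lower():
    print("Logged in")
else:
    print("Still on the login page")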
Put your email and password in payload['username'] and payload['password'], and I suppose it will log you in.
import requests
from bs4 import BeautifulSoup

url = "https://www.valueresearchonline.com/membership/getin.asp"
post_url = "https://www.valueresearchonline.com/registration/loginprocess.asp"

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0'
    site = s.get(url)
    soup = BeautifulSoup(site.text, "lxml")
    # Collect every named input (hidden fields included), then fill in the credentials.
    payload = {item['name']: item.get('value', '') for item in soup.select('input[name]')}
    payload['username'] = 'your email'
    payload['password'] = 'your password'
    p = s.post(post_url, data=payload)
    print(p.text)
I am trying to get the output page after submitting input on this page: http://batblob.com.
Below is my code, but it is giving me the HTML code of the same page rather than the output page.
I am on Python 2.7; please suggest how I should get there.
Thank you in advance.
import requests
import sys
from BeautifulSoup import BeautifulSoup

url = 'http://batblob.com'
session = requests.Session()

form = {
    'bitadd': 'anybitcoinaddress',
    'submit': 'submit',
}

r = requests.post(url, data=form)
response = session.get(url)
html = response.content
soup = BeautifulSoup(html)
print soup
In this case the data you want is already in your r variable. I would assume the following will get what you are after:
import requests
from BeautifulSoup import BeautifulSoup
url = 'http://batblob.com'
form = {
    'bitadd': 'anybitcoinaddress',
    'submit': 'submit',
}
# Post form data and parse response
response = requests.post(url, data=form)
soup = BeautifulSoup(response.content)
print(soup)