I am trying to scrape 3 products on a website. I have put all 3 URLs into a .txt file.
When running, for some reason I get a "Bad Request" on the first 2 URLs, while the 3rd URL scrapes successfully and sends a webhook to Discord. No matter what order I put the URLs in, only the last one works, whether I use proxies or not. (I added proxies because I want to handle a lot more URLs than just these 3.)
Here is my current code:
import requests
import lxml.html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed
import random

ua = UserAgent()
header = {'User-Agent': ua.chrome}

# Proxies
proxy_list = []
for line in open('proxies.txt', 'r'):
    line = line.replace('\n', '')
    proxy_list.append(line)

def get_proxy():
    proxy = random.choice(proxy_list)
    proxies = {
        "http": f'{str(proxy)}',
        "https": f'{str(proxy)}'
    }
    return proxies

# Opening URL file
with open('urls.txt', 'r') as file:
    for url in file.readlines():
        proxies = get_proxy()
        result = requests.get(url, headers=header, timeout=3, proxies=proxies)
        soup = BeautifulSoup(result.content, 'lxml')
Thanks for helping.
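A likely culprit here (an assumption based on the symptom, since I cannot run this against the real URLs): readlines() keeps the trailing newline on every line, so each URL except the last one is sent with a '\n' on the end and the server answers with Bad Request. A minimal sketch that strips the whitespace before requesting, with the proxy handling omitted for brevity:

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
header = {'User-Agent': ua.chrome}

with open('urls.txt', 'r') as file:
    # strip() removes the trailing '\n' (and stray spaces); blank lines are skipped entirely
    urls = [line.strip() for line in file if line.strip()]

for url in urls:
    result = requests.get(url, headers=header, timeout=3)
    soup = BeautifulSoup(result.content, 'lxml')
    print(url, result.status_code)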
I have the following code (I have X'd out my username and password). The code runs fine, but how do I know if it's using the proxy IP address to complete this scrape? I am teaching myself from scratch, so any help is appreciated as I know there is so much to learn!
import requests
from bs4 import BeautifulSoup
from random import choice
import pandas as pd

# Scrapy-style request with the proxy set via meta (fragment from a Spider class)
def start_requests(self):
    for url in self.start_urls:
        return Request(url=url, callback=self.parse,
                       headers={"User-Agent": "My user agent"},
                       meta={"proxy": "http://username:password@p.proxyegg.com:8080"})

url_list = []  # collects one dict per product

def getProduct_Data(tag):
    url = f'https://www.whiteline.com.au/product_detail4.php?part_number={tag}'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    Product_Data = {
        'sku': soup.find("div", {"class": "head2BR"}).text,
        'product': soup.find("div", {"style": "float:left; margin-left:24px; margin-top:8px; margin-right:14px;"}).text,
        'description': soup.find("div", {"style": "min-height:80px; max-height:224px; overflow-y: scroll;"}).text.strip(),
        'price': soup.find("div", {"class": "float_Q"}).text.strip(),
        'features': soup.find("div", {"class": "grey1"}).text.strip(),
        'contents': soup.find("div", {"style": "max-height:28px; overflow-y:scroll;"}).text.strip(),
        'compatiblity': soup.find("div", {"style": "width:960px; margin:auto; padding-top:18px;"}).text.strip(),
    }
    url_list.append(Product_Data)
    return

getProduct_Data('KBR15')
getProduct_Data('W13374')
getProduct_Data('BMR98')
getProduct_Data('W51210')
getProduct_Data('W51211')
getProduct_Data('W92498')
getProduct_Data('W93404')
getProduct_Data('W92899')
getProduct_Data('W51710')
getProduct_Data('W53277')
getProduct_Data('W53379')
getProduct_Data('BSK010M')
getProduct_Data('KSB568')

df = pd.DataFrame(url_list)
df.to_csv('whitelinefull.csv')
print('Fin.')
One way to test the efficacy of your proxy is to scrape an IP lookup/location site and check that the IP the site detects via the request is the same as the one your proxy is using:
import requests, re
from bs4 import BeautifulSoup as soup

# proxies and headers are the same dictionaries you already use for scraping
d = soup(requests.get('https://ip.me/', proxies=proxies, headers=headers).text, 'html.parser')
assert d.select_one('input[name="ip"]')['value'] == re.findall(r'(?:\d+\.)+\d+', proxies['https'])[0]
The detected IP from https://ip.me/ can be found in d.select_one('input[name="ip"]')['value'].
It may be that your proxies dictionary contains a non-IP URL with login credentials rather than an explicit IP (which appears to be the case from your post). To check that the proxy provider is working correctly, you can send a request to https://ip.me/ both with and without the proxies and compare the results:
d = soup(requests.get('https://ip.me/', proxies=proxies, headers=headers).text, 'html.parser')
d1 = soup(requests.get('https://ip.me/', headers=headers).text, 'html.parser')
assert d.select_one('input[name="ip"]')['value'] == d1.select_one('input[name="ip"]')['value'], "proxy does not seem to be working"
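If you do use credentials in the proxy URL, requests expects them embedded in the standard user:password@host:port form. A minimal sketch (the host and port come from the question; the username and password are placeholders, not values I can verify):

import requests

# Placeholder credentials and host; substitute your provider's real values
proxies = {
    'http': 'http://username:password@p.proxyegg.com:8080',
    'https': 'http://username:password@p.proxyegg.com:8080',
}
headers = {'User-Agent': 'Mozilla/5.0'}

# A bad or unreachable proxy typically raises requests.exceptions.ProxyError rather than returning a response
response = requests.get('https://ip.me/', proxies=proxies, headers=headers, timeout=10)
print(response.status_code)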
I am going to extract posts from a forum named "positive wellbeing during isolation" on HealthUnlocked.com.
I can extract posts without logging in, but I cannot extract posts that require a login. I used url = 'https://solaris.healthunlocked.com/posts/positivewellbeing/popular?pageNumber={0}'.format(page) to extract the posts, but I don't know how to connect it to the login, since that URL returns JSON.
I would appreciate it if you could help me.
import requests, json
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

url = "https://healthunlocked.com/private/programs/subscribed?user-id=1290456"
payload = {
    "username": "my username goes here",
    "Password": "my password goes here"
}
s = requests.Session()
p = s.post(url, data=payload)

headers = {"user-agent": "Mozilla/5.0"}
pages = 2
data = []
listtitles = []
listpost = []
listreplies = []
listpostID = []
listauthorID = []
listauthorName = []

for page in range(1, pages):
    url = 'https://solaris.healthunlocked.com/posts/positivewellbeing/popular?pageNumber={0}'.format(page)
    r = requests.get(url, headers=headers)
    posts = json.loads(r.text)
    for post in posts:
        sleep(3.5)
        listtitles.append(post['title'])
        listreplies.append(post["totalResponses"])
        listpostID.append(post["postId"])
        listauthorID.append(post["author"]["userId"])
        listauthorName.append(post["author"]["username"])
        url = 'https://healthunlocked.com/positivewellbeing/posts/{0}'.format(post['postId'])
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        listpost.append(soup.select_one('div.post-body').get_text('|', strip=True))

## save to CSV
df = pd.DataFrame(list(zip(*[listpostID, listtitles, listpost, listreplies, listauthorID, listauthorName]))).add_prefix('Col')
df.to_csv('out1.csv', index=False)
print(df)
sleep(2)
For most websites, you have to first get a token by logging in. Most of the time this is a cookie. Then, in authorized requests, you can send that cookie along. Open the network tab in developer tools and log in with your username and password; you'll be able to see how the login request is formatted and which URL it is sent to. From there, just try to replicate it in your code.
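As a rough sketch of that idea (the login endpoint and form field names below are assumptions; copy the real ones from the request you see in the network tab), a requests.Session keeps whatever cookies the login response sets and sends them on later requests:

import requests

session = requests.Session()

# Hypothetical login endpoint and field names: replace with the values
# observed in the browser's network tab.
login_url = 'https://example.com/api/login'
credentials = {'username': 'my username', 'password': 'my password'}

resp = session.post(login_url, json=credentials)
resp.raise_for_status()   # fail early if the login request itself errored
print(session.cookies)    # the session now holds whatever cookies the server set

# Subsequent requests on the same session automatically send those cookies along
posts = session.get('https://example.com/private/posts').json()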
I would like to perform web scraping for a website which requires login. I tried two different code approaches, but I am still unable to log in.
# Develop code in Python using BeautifulSoup:

# First Approach
import requests
from bs4 import BeautifulSoup
import http.cookiejar
import urllib.request
import urllib.parse

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)

authentication_url = 'http://127.0.0.1/orangehrm4.3.1/symfony/web/index.php/auth/login'
payload = {'txtUsername': '<username>', 'txtPassword': '<password>'}
data = urllib.parse.urlencode(payload).encode("utf-8")

req = urllib.request.Request(authentication_url, data)
resp = urllib.request.urlopen(req)
contents = resp.read()

# Second Approach
import requests
from lxml import html

USERNAME = "<USERNAME>"
PASSWORD = "<PASSWORD>"

LOGIN_URL = "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/auth/login"
URL = "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/dashboard"

def main():
    session_requests = requests.session()

    # Getting login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    csrf_token = list(set(tree.xpath("//input[@name='_csrf_token']/@value")))[0]

    payload = {
        "txtUsername": USERNAME,
        "txtPassword": PASSWORD,
        "_csrf_token": csrf_token
    }
    result = session_requests.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))

    # URL to scrape
    result = session_requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    div = soup.find('div', id='branding')
    print(div)

if __name__ == '__main__':
    main()
After using these methods, we are only able to get data for the login page. I assume we are not actually logging in, since I want to access data from the pages that come after login.
It would be very helpful to figure out how web scraping can be performed after logging in to the website using Python and BeautifulSoup.
Try using one of the authorization methods that requests provides instead of a payload-based login. For example, here I am using HTTPBasicAuth:
import requests
from requests.auth import HTTPBasicAuth
USERNAME = "<USERNAME>"
PASSWORD = "<PASSWORD>"
BASIC_AUTH = HTTPBasicAuth(USERNAME, PASSWORD)
LOGIN_URL = "http://127.0.0.1/orangehrm-4.3.1/symfony/web/index.php/auth/login"
response = requests.get(LOGIN_URL,headers={},auth=BASIC_AUTH)
As it stands, I am able to get the content of the webpage at the PDF link (EXAMPLE OF THE LINK HERE). But I don't want the content of the webpage; I want the content of the PDF itself, so I can save it as a PDF file in a folder on my computer.
I have been successful in doing this on sites that I don't need to log into and without a proxy server.
Relevant CODE:
import os
import urllib2
import time
import requests
import urllib3
from random import *

s = requests.Session()
data = {"Username": "username", "Password": "password"}
url = "https://login.url.com"
print "doing things"
r2 = s.post(url, data=data, proxies={'https': 'https://PROXYip:PORT'}, verify=False)
# I get a response 200 from printing r2
print r2

download_url = "http://msds.walmartstores.com/client/document?productid=1000527&productguid=54e8aa24-0db4-4973-a81f-87368312069a&DocumentKey=undefined&HazdocumentKey=undefined&MSDS=0&subformat=NAM"
file = open("F:\my_filepath\document" + str(maxCounter) + ".pdf", 'wb')
temp = s.get(download_url, proxies={'https': 'https://PROXYip:PORT'}, verify=False)
# This prints out the response from the proxy server (i.e. 200)
print temp

something = uniform(5, 6)
print something
time.sleep(something)

# This gets me the content of the web page, not the content of the PDF
print temp.content
file.write(temp.content)
file.close()
I need help figuring out how to "download" the content of the PDF
try this:
import requests

url = 'http://msds.walmartstores.com/client/document?productid=1000527&productguid=54e8aa24-0db4-4973-a81f-87368312069a&DocumentKey=undefined&HazdocumentKey=undefined&MSDS=0&subformat=NAM'
pdf = requests.get(url)
with open('walmart.pdf', 'wb') as file:
    file.write(pdf.content)
Edit
Try again with a requests session to manage cookies (assuming they send you those after login), and maybe also a different proxy:
import requests

proxy_dict = {'https': 'ip:port'}

with requests.Session() as session:
    # Authentication request, use GET/POST whatever is needed
    # data variable should hold user/password information
    auth = session.get(login_url, data=data, proxies=proxy_dict, verify=False)

    if auth.status_code == 200:
        print(auth.cookies)  # Tell me if you got anything
        # We're continuing the same session, so the login cookies are sent along
        pdf = session.get(download_url, proxies=proxy_dict, verify=False)
        with open('walmart.pdf', 'wb') as file:
            file.write(pdf.content)
    else:
        print('No go, got {0} response'.format(auth.status_code))
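One extra check that may help diagnose the original symptom (a general suggestion, not something the snippet above requires): inspect the response's Content-Type header before writing the file, so you can tell whether the proxy and login actually returned a PDF or just an HTML page. A small sketch that reuses the logged-in session, download URL, and proxy dictionary from above:

import requests

def save_pdf(session, download_url, proxy_dict, out_path='walmart.pdf'):
    # session is the already-logged-in requests.Session from the snippet above
    response = session.get(download_url, proxies=proxy_dict, verify=False)
    content_type = response.headers.get('Content-Type', '')
    if 'pdf' in content_type.lower():
        with open(out_path, 'wb') as f:
            f.write(response.content)
        return True
    # An HTML content type usually means a login or error page came back instead of the PDF
    print('Expected a PDF but got: {0}'.format(content_type))
    return False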
I am trying to get the output page once input is submitted on this page: http://batblob.com
Below is my code, but it is giving me the HTML of the same page rather than the output page.
I am on Python 2.7. Please suggest how I should get there.
Thank you in advance.
import requests
import sys
from BeautifulSoup import BeautifulSoup

url = 'http://batblob.com'
session = requests.Session()
form = {
    'bitadd': 'anybitcoinaddress',
    'submit': 'submit',
}
r = requests.post(url, data=form)
response = session.get(url)
html = response.content
soup = BeautifulSoup(html)
print soup
In this case the data you want is already in your r variable. I would assume the following will get what you are after.
import requests
from BeautifulSoup import BeautifulSoup

url = 'http://batblob.com'
form = {
    'bitadd': 'anybitcoinaddress',
    'submit': 'submit',
}

# Post form data and parse response
response = requests.post(url, data=form)
soup = BeautifulSoup(response.content)
print(soup)