Trying to fake and rotate user agents - Python

I am trying to fake user agents as well as rotate them in Python.
I found a tutorial online about how to do this with Scrapy using the scrapy-useragents package.
I scrape the webpage https://www.whatsmyua.info/ to check my user agent, to see whether it is different from mine and whether it rotates. It is different from my actual user agent, but it does not rotate; it returns the same user agent each time, and I cannot figure out what is going wrong.
settings.py
BOT_NAME = 'scrapy_javascript'
SPIDER_MODULES = ['scrapy_javascript.spiders']
NEWSPIDER_MODULE = 'scrapy_javascript.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_javascript (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# -----------------------------------------------------------------------------
# USER AGENT
# -----------------------------------------------------------------------------
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
}
USER_AGENTS = [
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/57.0.2987.110 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.79 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) '
     'Gecko/20100101 '
     'Firefox/55.0'),  # firefox
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.91 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/62.0.3202.89 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/63.0.3239.108 '
     'Safari/537.36'),  # chrome
]
SPLASH_URL = 'http://199.89.192.74:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

Here you can find an API returning the most common user agents as JSON:
http://51.158.74.109/useragents/?format=json
I have used this tool, which keeps your list of user agents updated with the most recent and most used ones: https://pypi.org/project/shadow-useragent/
from shadow_useragent import ShadowUserAgent
shadow_useragent = ShadowUserAgent()
print(shadow_useragent.firefox)
# Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0
print(shadow_useragent.chrome)
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
print(shadow_useragent.safari)
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15
print(shadow_useragent.edge)
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134
print(shadow_useragent.ie)
# Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
print(shadow_useragent.android)
# Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
print(shadow_useragent.ipad)
# Mozilla/5.0 (iPad; CPU OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Mobile/15E148 Safari/604.1
# and the best one: random, weighted by real-world browser usage statistics
print(shadow_useragent.random)
# Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0

# if you want to exclude mobiles (some websites will display different pages)
print(shadow_useragent.random_nomobile)
# Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36
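If you would rather wire this into Scrapy than call it by hand, here is a minimal sketch of a downloader middleware built on shadow-useragent (the myproject.middlewares path is illustrative):
from shadow_useragent import ShadowUserAgent

class ShadowUserAgentMiddleware:
    """Set a fresh, usage-weighted User-Agent on every outgoing request."""

    def __init__(self):
        self.ua = ShadowUserAgent()

    def process_request(self, request, spider):
        # overwrite rather than setdefault, so each request really gets a new UA
        request.headers['User-Agent'] = self.ua.random
Enable it in settings.py by adding 'myproject.middlewares.ShadowUserAgentMiddleware': 500 to DOWNLOADER_MIDDLEWARES and disabling the built-in 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, as in the question's settings.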

Figured it out by creating a csv file with all my urls, each paired with an IP and a user agent, so every time I access a webpage I use one of those IP + user agent pairs. Then I had to override splash_url in my spider so that it equals the proxy I am using at that moment.
SplashSpider.py
import csv
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import GameItem
# process the csv file so the url + ip address + useragent pairs are the same as defined in the file
# returns a list of dictionaries, example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
# 'ip': 'http://204.152.114.244:8050',
# 'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
# ...
# ]
def process_csv(csv_file):
    data = []
    reader = csv.reader(csv_file)
    next(reader)  # skip the header row
    for fields in reader:
        if fields[0] != "":
            url = fields[0]
        else:
            continue  # skip the whole row if the url column is empty
        if fields[1] != "":
            ip = "http://" + fields[1] + ":8050"  # adding http and port because this is the needed scheme
        if fields[2] != "":
            useragent = fields[2]
        data.append({"url": url, "ip": ip, "ua": useragent})
    return data

class MySpider(Spider):
    name = 'splash_spider'  # name of the spider
    # notice that we don't need to define start_urls;
    # just make sure to yield all the urls you want to scrape inside start_requests
    # getting all the url + ip address + useragent pairs, then requesting them
    def start_requests(self):
        # get the path of the csv file that contains the pairs from settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)
        for req in requests:
            # no need to create custom middlewares:
            # pass the useragent via the headers param, and the proxy via splash_url
            yield SplashRequest(url=req["url"], callback=self.parse,
                                args={"wait": 3},
                                headers={"User-Agent": req["ua"]},
                                splash_url=req["ip"])
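For reference, process_csv skips one header row and reads the url, ip and user agent from the first three columns, so a hypothetical proxies.csv (values borrowed from the comment above) would look like:
url,ip,ua
http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan,204.152.114.244,Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11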
settings.py
BOT_NAME = 'scrapy_javascript'
SPIDER_MODULES = ['scrapy_javascript.spiders']
NEWSPIDER_MODULE = 'scrapy_javascript.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# The path of the csv file that contains the pairs
PROXY_CSV_FILE = "proxies.csv"
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
#SPLASH_URL = 'http://127.0.0.1:8050'
#SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 60

Check out https://www.useragents.me/. It is an auto-updating list of the latest user agents, available in JSON format (auto-updating, so even if the site is neglected the data shouldn't go stale).
Conveniently, they even offer a Python-coded user-agent rotator function directly on the site (with user agents current as of November 4, 2022):
import random
def random_ua(k=1):
    # returns a random useragent from the latest user agent strings list,
    # weighted according to observed prevalence
    ua_pct = {"ua": {"0": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "1": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0", "2": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "3": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0", "4": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "5": "Mozilla/5.0 (X11; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0", "6": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", "7": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", "8": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", "9": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15", "10": "Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0", "11": "Mozilla/5.0 (Windows NT 10.0; rv:105.0) Gecko/20100101 Firefox/105.0", "12": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:105.0) Gecko/20100101 Firefox/105.0", "13": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0", "14": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}, "pct": {"0": 28.8, "1": 13.28, "2": 10.98, "3": 8.55, "4": 6.25, "5": 5.56, "6": 4.53, "7": 4.27, "8": 3.57, "9": 2.93, "10": 2.99, "11": 2.55, "12": 2.44, "13": 1.7, "14": 1.59}}
    return random.choices(list(ua_pct['ua'].values()), list(ua_pct['pct'].values()), k=k)

# usage
random_ua()  # optional k = number of useragents to return
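To tie this back to the Scrapy question above, a hedged sketch of using random_ua per request (the spider name is illustrative; random_ua is assumed to be defined or imported in the same module):
import scrapy

class UADemoSpider(scrapy.Spider):
    name = "ua_demo"  # hypothetical spider name

    def start_requests(self):
        # random_ua() returns a list of k user agents; take the first one
        yield scrapy.Request("https://www.whatsmyua.info/",
                             headers={"User-Agent": random_ua()[0]},
                             callback=self.parse)

    def parse(self, response):
        # whatsmyua.info echoes the user agent the server saw
        self.logger.info(response.css("title::text").get())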

Related

How to avoid Scrapy url being redirected

I'm trying to scrape a website using Scrapy. Sometimes my code works and sometimes it does not. The reason it cannot scrape the page is that the url gets redirected. For example, I was trying to scrape www.example.com/page=1, which was redirected to www.verify.example.com. It appears to move you to the verify version of the url in order to check that you are a human. I have tried using a proxy with many user agents, but this is not working. I think the problem is in how I request the url:
res = scrapy.Request(
    url=the_url,
    meta={
        "proxy": "https://" + proxy,
    },
    callback=self.next_page,
)
I get the proxy from a page of free proxies. Here is the code:
# imports needed by this snippet
import requests
from lxml.html import fromstring

def get_proxies(self, empty):
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    list_ip = []
    proxies = set()
    for i in parser.xpath('//tbody/tr')[:100]:
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            # grabbing IP and corresponding PORT
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    for loop_proxies in proxies:
        list_ip.append(loop_proxies)
    return list_ip
I got the "get_proxies" functioning from internet. And in settings.py it was like this :
# -*- coding: utf-8 -*-
# Scrapy settings for example project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'example (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
LOG_LEVEL = 'INFO'
LOG_ENABLED = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# DEFAULT_REQUEST_HEADERS = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
# 'Cookie':'s_ViewType=10; _lxsdk_s=16eb7c37081-08b-750-06a%7C%7C200; _lxsdk_cuid=16eb8d09cf5c8-05ca75fa4b8f66-14291003-100200-16eb8d09cf5c8; _lxsdk=16eb8d09cf5c8-05ca75fa4b8f66-14291003-100200-16eb8d09cf5c8; _hc.v=8f832efa-3775-aeec-1189-37c6d815ecb0.1575058712'
# 'Cookie':'ALF=1573490222; _T_WM=148616430989eb5a607186e5ccc1bd5a4754b18b318; SCF=AlnIW_LpGKigwbo6ysyWWwVvTa_owlI2qJO_J1CxkMPEGosrBTKlnFR2fkvt82OUSt8ALZaDaUYGDQ9K0TtOAK8.; SUB=_2A25wpnFhDeRhGeFN6loW9CrPyDiIHXVQaR8prDV6PUJbktBeLRahkW1NQEqapQmLSHVEFcFyeIkn1OBaYewoa96T; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEx4XGa4r.ozPRKdw4_Wmx5JpX5K-hUgL.FoM0eKnNShB0e0B2dJLoIpRLxK-LB-BL1KBLxK-LBKBLB-zLxK-LB-BL1K5peo-t; SUHB=0TPVAlkzF8-YRN; SSOLoginState=1570898225; MLOGIN=1; _T_WL=1'
# }
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'example.middlewares.DianpingScrapeSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'example.middlewares.DianpingScrapeDownloaderMiddleware': 543,
    'example.middlewares.RotateUserAgentMiddleware': 543,
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'example.pipelines.DianpingScrapePipeline': 300,
    # 'example.downloadermiddlewares.cookies.CookiesMiddleware': 700,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPERROR_ALLOWED_CODES = [403]
And I added a new class inside middlewares.py to set the user agent, as follows:
# imports needed by this snippet
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list contains Chrome, IE, Firefox, Mozilla, Opera and Netscape strings;
    # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
And I got various errors and warnings:
twisted.internet.error.TCPTimedOutError: TCP connection timed out: 110: Connection timed out.
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.example.com/>
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Crawled 13 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

Multi-socket connection to a server in Python

I get this error with Python sockets. My aim is to create a slowloris attack, but I am having problems making multiple connections to my router within one program.
I want to keep the sockets in a list so I can reference them later.
import socket
import time
import os
import random
socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
user_agents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
]
def clear():
    os.system("cls")

print("Starting...")
print("press enter to start")
agreement = input("")
port = 445
port_s = str(port)
socket.settimeout(4)
list_of_sockets = []
import socket
servers = []  # add servers here
if agreement == "":
    clear()
    print("")
    print("welcome...")
    target = input("ip target>>>")
    print("defult port is " + port_s)
    print("-" * 10 + " START " + "-" * 10)
    print("")
    def connect_to():
        int_nob = int(200)  # num of bots
        for x in range(0, int_nob):
            print(int(int_nob))
            int_nob -= 1
            int_nob = socket.connect((target, port))
            int_nob.send("User-Agent: {}\r\n".format(random.choice(user_agents)).encode("utf-8"))
            client = new Client()
            if int_nob == 0:
                print(list_of_sockets)
                print("resending sockets of " + int_nob)
    while True:
        connect_to()
else:
    print("breaking...")
    exit()
The error:
Traceback (most recent call last):
File "C:\Users\townte23\Desktop\slow lorris,.py", line 78, in <module>
connect_to()
File "C:\Users\townte23\Desktop\slow lorris,.py", line 71, in connect_to
a = socket.connect((target, port))
OSError: [WinError 10056] A connect request was made on an already connected socket
I did cannibalize someone else's code, but most of it is mine:
https://github.com/gkbrk/slowloris/blob/master/slowloris.py
I did find a similar issue, but it was a server-side issue, so I'm not sure how to approach this error.
Edit: found the problem and the solution. The problem was reusing int_nob as both counter and socket:
int_nob = int(200)
for ...:
    int_nob = socket.connect(())
    int_nob.send(bytes("thing"))
    int_nob -= 1
socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
...
def connect_to():
    ...
    for x in range(0, int_nob):
        ...
        a = socket.connect((target, port))
        ...

while True:
    connect_to()
You cannot do multiple connects with the same socket; you have to create a new socket for each new connection. Apart from that, it is very confusing that you call your socket simply socket, since this name conflicts with the socket module you import.
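A minimal sketch of that fix, reusing the question's target, port and user_agents variables: build a fresh socket object for each connection and collect them in the list, leaving the socket module name unshadowed:
import random
import socket

def open_sockets(target, port, user_agents, count=200):
    list_of_sockets = []
    for _ in range(count):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # new socket per connection
        s.settimeout(4)
        s.connect((target, port))
        s.send("User-Agent: {}\r\n".format(random.choice(user_agents)).encode("utf-8"))
        list_of_sockets.append(s)
    return list_of_sockets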

Avoiding detection during scraping

I am trying to scrape the website lacentrale.fr with Scrapy, but even though I rotate my user agents and IP addresses (via TOR), the website detects my robot and sends me false values.
Please can you check the code used in my middlewares and settings and tell me if something is wrong?
Code in middlewares:
from tutorial.settings import *  # USER_AGENT_LIST
import random
from stem.control import Controller
from toripchanger import TorIpChanger
from stem import Signal

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)

def _set_new_ip():
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password='')
        controller.signal(Signal.NEWNYM)

ip_changer = TorIpChanger(reuse_threshold=10)

class ProxyMiddleware(object):
    _requests_count = 0

    def process_request(self, request, spider):
        self._requests_count += 1
        if self._requests_count > 10:
            self._requests_count = 0
            ip_changer.get_new_ip()
            print("New Tor connection processed")
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        spider.log('Proxy : %s' % request.meta['proxy'])
Code used in settings:
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'tutorial.middlewares.RandomUserAgentMiddleware': 400,
    # disabled to avoid "IOError: Not a gzipped file" exceptions
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': None,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'tutorial.middlewares.ProxyMiddleware': 100,
}
USER_AGENT_LIST = [
    {'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:35.0) Gecko/20100101 Firefox/35.0'},
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0'},
]
EDIT II
It seems that Tor uses the same IP each time and there is no rotation of the IP address. I don't know what I can change in my middlewares file to resolve this.
Please, any idea?
You may be detected on several factors, including whether your scraper downloads and runs the JavaScript files. If that's the case, you may need to use a tool like Selenium in conjunction with Python/Scrapy to behave more like a normal human user.
This Stack Overflow post offers some help in getting started:
https://stackoverflow.com/a/17979285/9693088
I don't think I can offer much guidance on what may be going wrong with your TOR setup.
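To debug the EDIT II problem, it may help to verify what exit IP each request actually uses. A minimal sketch, assuming the question's setup (Privoxy on 127.0.0.1:8118 forwarding to Tor, control port 9051); httpbin.org/ip is just a convenient address-echo service:
import time

import requests
from stem import Signal
from stem.control import Controller

PROXIES = {'http': 'http://127.0.0.1:8118', 'https': 'http://127.0.0.1:8118'}

def current_ip():
    # ask the echo service which address the request arrived from
    return requests.get('http://httpbin.org/ip', proxies=PROXIES).json()['origin']

print('before:', current_ip())
with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='')
    controller.signal(Signal.NEWNYM)
time.sleep(10)  # Tor rate-limits NEWNYM; give the new circuit time to build
print('after:', current_ip())
If the two printed addresses match, the NEWNYM signal is not taking effect (often a control-port authentication or rate-limit issue) rather than anything in the Scrapy middleware.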

Urllib3 HTTP Error 502: Bad Gateway

I am trying to scrape zk.fm in order to download music, but it's giving me some trouble. I'm using urllib3 to generate a response, but this always yields a Bad Gateway error, while accessing the website through a browser works perfectly fine.
This is my code (with a random fake user agent). I'm trying to access "http://zk.fm/mp3/search?keywords=" followed by keywords which indicate the song name and artist, for example "http://zk.fm/mp3/search?keywords=childish+gambino+heartbeat".
from bs4 import BeautifulSoup
from random import choice
import urllib3
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]

def random_headers():
    return {'User-Agent': choice(desktop_agents)}

ua = random_headers()
http = urllib3.PoolManager(10, headers=ua)
response = http.request('GET', "http://zk.fm/mp3/search?keywords=childish+gambino+heartbeat")
soup = BeautifulSoup(response.data, "html.parser")
Is there a way to work around the 502 Error, or is it out of my control?
You need to enable persistence of cookies, then access, in order, the site home page followed by the search URL. I suggest (personally) python-requests, but it is up to you. See here for discussion.
I tested this by visiting the search page: error 502. Visit home page: 200. Visit search: 200. Clear cookies and visit search again: 502. So it must be cookies that are the problem.
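A minimal sketch of that flow with python-requests (the search URL is the one from the question; the Session object persists cookies between the two calls):
import requests

session = requests.Session()  # cookies set by the home page are kept for later requests
session.get("http://zk.fm")   # visit the home page first to receive the cookies
response = session.get("http://zk.fm/mp3/search?keywords=childish+gambino+heartbeat")
print(response.status_code)   # 200 once the cookies are present, per the test above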

Web parsing with Python BeautifulSoup producing inconsistent results

I am trying to parse the table on this site using Python and BeautifulSoup. While it produces correct output on my Ubuntu 14.04 machine, it produces wrong output on my friend's Windows machine. I am pasting the code snippet here:
# imports needed by this snippet
import re

import requests
from bs4 import BeautifulSoup

def buildURL(agi, families):
    # agi and families contain space-separated strings of genes and families
    genes = agi.split(" ")
    families = families.split(" ")
    base_url = "http://www.athamap.de/search_gene.php"
    url = base_url
    if len(genes):
        url = url + "?agi="
        for i, gene in enumerate(genes):
            if i > 0:
                url = url + "%0D%0A"
            url = url + gene
    url = url + "&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos"
    for family in families:
        family = family.replace("/", "%2F")
        url = url + "&familySelected%5B" + family + "%5D=on"
    url = url + "&formSubmitted=TRUE"
    return url

def fetch_html(agi, families):
    url = buildURL(agi, families)
    response = requests.get(url)
    soup = BeautifulSoup(str(response.text), "lxml")
    divs = soup.find_all('div')
    seldiv = ""
    for div in divs:
        try:
            if div["id"] == "geneAnalysisDetail":
                # this div contains the interesting data
                seldiv = div
        except:
            pass
    return seldiv

def parse(seldiv):
    soup = seldiv
    rows = soup.find_all('tr')
    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation",
                  "Relative Distance", "Max score", "Threshold Score", "Score"]
    print(attributes)
    save_rows = []
    for i in range(2, len(rows)):
        cols = rows[i].find_all('td')
        lst = []
        for j, col in enumerate(cols):
            if j == 0:
                lst.append(re.sub('', '', str(col.contents[1].contents[0])))
            elif j == 1:
                lst.append(str(col.contents[1].contents[0]))
            elif j == 2:
                lst.append(str(col.contents[0]))
            elif j == 3:
                lst.append(str(col.contents[1].contents[0]))
            else:
                lst.append(str(col.contents[0]))
        save_rows.append(lst)
    return save_rows
Any idea what could go wrong here? I have tried with and without lxml.
Thanks in advance.
You can parse the table this way, and it should work well on both machines. The buildURL function can be left unchanged.
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    seldiv = soup.find("div", id="geneAnalysisDetail")
    return seldiv

def parse(url):
    soup = fetch_html(url)
    rows = soup.find_all("tr")
    attributes = ["Gene", "Factor", "Family", "Position", "Relative orientation",
                  "Relative Distance", "Max score", "Threshold Score", "Score"]
    save_rows = []
    for i in range(2, len(rows)):
        cols = rows[i].find_all("td")
        lst = []
        for col in cols:
            text = col.get_text()
            text = text.strip(" ")
            text = text.strip("\n")
            lst.append(text)
        save_rows.append(lst)
    return save_rows

url = "http://www.athamap.de/search_gene.php?agi=At1g76540%0D%0AAt3g12280%0D%0AAt4g28980%0D%0AAt4g37630%0D%0AAt5g11300%0D%0AAt5g27620%0D%0A&upstream=-500&downstream=50&restriction=0&sortBy1=gen&sortBy2=fac&sortBy3=pos&familySelected[ARF]=on&familySelected[CAMTA]=on&familySelected[GARP%2FARR-B]=on&formSubmitted=TRUE"
save_rows = parse(url)
for row in save_rows:
    print(row)
One possibility is that you didn't set a user agent for the requests. A different user agent will sometimes get a different result, especially from a quirky website. Here is a list of possible agents; just choose one, it doesn't have to match your own machine (a short usage sketch follows the list).
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
]
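For example, plugging one of these into the requests call (a minimal sketch; the url variable is the one built above):
import random
import requests

headers = {'User-Agent': random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers)
print(response.status_code)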
