Downloading a webpage using urllib2 results in garbled junk? (only sometimes) - python

How come when I hit this webpage, I get HTML text:
http://itunes.apple.com/us/app/mobile/id381057839
But when I hit this webpage, I get garbled junk?
http://itunes.apple.com/us/app/mobile/id375562663
I use the same download() function in Python for both, shown here:
import socket
import urllib2

def download(source_url):
    try:
        socket.setdefaulttimeout(10)
        agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.10) Gecko/20100914 AlexaToolbar/alxf-1.54 Firefox/3.6.10 GTB7.1"
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', agent)
        ree.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        ree.add_header("Accept-Language", "en-us,en;q=0.5")
        ree.add_header("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7")
        # Advertising gzip/deflate invites a compressed response from the server.
        ree.add_header("Accept-Encoding", "gzip,deflate")
        ree.add_header("Host", "itunes.apple.com")
        resp = urllib2.urlopen(ree)
        htmlSource = resp.read()
        return htmlSource
    except Exception, e:
        print e

Solved. It was a compression issue: the request advertises Accept-Encoding: gzip, so the server sometimes sends a gzip-compressed body, which was never being decompressed. This version gunzips the response:
import socket
import random
import urllib2
import StringIO
import gzip

def download(source_url):
    try:
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)',
                  'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
                  'Microsoft Internet Explorer/4.0b1 (Windows 95)',
                  'Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-Encoding', 'gzip')
        opener = urllib2.build_opener()
        h = opener.open(ree).read()
        # The body arrives gzip-compressed; decompress it before returning.
        compressedstream = StringIO.StringIO(h)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        data = gzipper.read()
        return data
    except Exception, e:
        print e
        return ""

Related

Instagram story scraper: What would the process be?

I'm trying to code a web-scraping Python program that gets stories from users with your login. I thought it would be fun to see if I could even get it working, since 4K Stogram costs money just for more functionality.
I logged in successfully, but I don't know where to go from here.
from bs4 import BeautifulSoup
import json, random, re, requests

USERNAME = '*****'
PASSWD = '****'
account_purging = '****'
BASE_URL = 'https://www.instagram.com/accounts/login/'
LOGIN_URL = BASE_URL + 'ajax/'

headers_list = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.6.01001)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.7.01001)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.5.01003)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8",
    "Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:11.0) Gecko/20100101 Firefox/11.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.0.3705)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
    "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.01",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0.1) Gecko/20100101 Firefox/5.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.02",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1",
    "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 7.02 Bork-edition [en]"
]

# random.choice avoids the IndexError that
# headers_list[random.randrange(0, len(headers_list) + 1)] could raise.
USER_AGENT = random.choice(headers_list)

session = requests.Session()
session.headers = {'user-agent': USER_AGENT}
session.headers.update({'Referer': BASE_URL})

req = session.get(BASE_URL)
soup = BeautifulSoup(req.content, 'html.parser')
body = soup.find('body')
pattern = re.compile('window._sharedData')
script = body.find("script", text=pattern)
script = script.get_text().replace('window._sharedData = ', '')[:-1]
data = json.loads(script)
csrf = data['config'].get('csrf_token')

login_data = {'username': USERNAME, 'password': PASSWD}
session.headers.update({'X-CSRFToken': csrf})
login = session.post(LOGIN_URL, data=login_data, allow_redirects=True)

story_page = "https://www.instagram.com/stories" + "/" + account_purging
# stories url is:
request_headers_story = {
    "Accept": "video/webm,video/ogg,video/*;q…q=0.7,audio/*;q=0.6,*/*;q=0.5",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "DNT": "1",
    "Host": "scontent-ort2-1.cdninstagram.com",
    "Range": "bytes=0-",
    "Referer": story_page,
    "TE": "Trailers",
    "User-Agent": USER_AGENT
}
# These are HTTP headers, so they belong in headers=, not in data=;
# fetching the stories page is a GET, not a POST.
resp = session.get(story_page, headers=request_headers_story, allow_redirects=True)
print(BeautifulSoup(resp.content, 'html.parser'))
I'm trying to get the mp4 and jpg links and use them to download the media later, in an array or something. If there's anything you could point me towards, I'd appreciate it.
I'm also trying to avoid using the API because that just makes it boring.
The easier solution to this problem, which avoids using an API, is to use Selenium. With Selenium you can log in much faster and more efficiently, as well as grab the images and videos you need.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
#or driver = webdriver.Chrome()
Note: To grab the image, you need to find the id or name of the image and do something like:
driver.find_element_by_id("image_id")
or
driver.find_element_by_name("image_name")
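Putting it together, here is a minimal sketch of the whole flow; the field names, account name, and sleep-based waits are assumptions you would adapt to the live page:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Firefox()
driver.get("https://www.instagram.com/accounts/login/")
time.sleep(3)  # crude wait for the login form to render; WebDriverWait is more robust

# Assumed field names for the login form -- inspect the page to confirm them.
driver.find_element_by_name("username").send_keys("my_username")
driver.find_element_by_name("password").send_keys("my_password", Keys.RETURN)
time.sleep(5)

# Visit the stories page and harvest media URLs from <img>/<video> tags.
driver.get("https://www.instagram.com/stories/some_account/")
time.sleep(3)
links = [el.get_attribute("src") for el in driver.find_elements_by_tag_name("img")]
links += [el.get_attribute("src") for el in driver.find_elements_by_tag_name("video")]
print([l for l in links if l])  # drop elements without a src attribute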
If you need more information or clarification, check https://selenium-python.readthedocs.io/.
Let me know if that helped you!

Python urllib.urlretrieve and user agent

I am retrieving an XML file from a network device. It returns the file in a different format, without HTML tags, if I do not specify a user agent.
import urllib
urllib.urlretrieve(url, file_save_name)
How do I specify a user agent when retrieving?
Sounds like you could do the following. urlretrieve has no header argument (its third parameter is a progress-report callback), so in Python 2 you set the user agent on the opener that urlretrieve uses under the hood:
import random
import urllib

# User-Agents for multiple browsers and OSs
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]

class CustomOpener(urllib.FancyURLopener):
    # FancyURLopener sends its `version` attribute as the User-Agent header.
    version = random.choice(user_agents)

# Install the opener so urlretrieve picks it up.
urllib._urlopener = CustomOpener()
urllib.urlretrieve(url, file_save_name)
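If you would rather not touch urllib's module-level opener, a sketch of the same download using urllib2, which accepts headers directly (url, file_save_name, and user_agents as above):
import random
import urllib2

req = urllib2.Request(url, headers={'User-Agent': random.choice(user_agents)})
body = urllib2.urlopen(req).read()
with open(file_save_name, 'wb') as f:
    f.write(body)  # write the raw response body to the target file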

Urllib bad request issue

I tried every 'User-Agent' in here, and I still get urllib.error.HTTPError: HTTP Error 400: Bad Request. I also tried this, but then I get urllib.error.URLError: File Not Found. I have no idea what to do; my current code is:
from bs4 import BeautifulSoup
import urllib.request, json, ast

with open("urller.json") as f:
    cc = json.load(f)  # the file I get links from; you can try this link instead
    # cc = ../games/index.php?g_id=23521&game=0RBITALIS

for x in ast.literal_eval(cc):  # cc is a str(list) so I have to convert
    if x.startswith("../"):
        # x[2::] because I removed the '../' parts from the urls
        r = urllib.request.Request("http://www.game-debate.com{}".format(x[2::]),
                                   headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'})
        rr = urllib.request.urlopen(r).read()
        soup = BeautifulSoup(rr)
        for y in soup.find_all("ul", attrs={'class': ['devDefSysReqList']}):
            print(y.text)
Edit: If you try only one link, it probably won't show any error; I get the error every time at the 6th link.
A quick fix is to replace the space with +:
url = "http://www.game-debate.com"
r = urllib.request.Request(url + x[2:].replace(" ", "+"),
                           headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'})
A better option may be to let urllib quote the path:
from bs4 import BeautifulSoup
import urllib.request, json, ast
from urllib.parse import quote, urljoin

with open("urller.json") as f:
    cc = json.load(f)  # the file I get links from; you can try this link instead

url = "http://www.game-debate.com"

for x in ast.literal_eval(cc):  # cc is a str(list) so I have to convert
    if x.startswith("../"):
        # quote percent-encodes the spaces; keep the query-string
        # delimiters in safe= so ? & = are left alone.
        r = urllib.request.Request(urljoin(url, quote(x.lstrip("."), safe="/?&=")), headers={
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'})
        rr = urllib.request.urlopen(r).read()
        soup = BeautifulSoup(rr, "html.parser")
        print(rr.decode("utf-8"))
        for y in soup.find_all("ul", attrs={'class': ['devDefSysReqList']}):
            print(y.text)
Spaces in a URL are not valid and need to be percent-encoded as %20 or replaced with +.
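For illustration, here is what quote does to a made-up path in the same shape as the ones above:
from urllib.parse import quote

path = "/games/index.php?g_id=12345&game=Some Game Name"
print(quote(path))               # encodes ? & = too, which breaks the query string
print(quote(path, safe="/?&="))  # only the spaces become %20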

Python - Urllib.Request - Change location of downloaded file

How can I choose the location where the downloaded file is stored? My code:
import urllib.request
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent':user_agent,}
request = urllib.request.Request("http://download.thinkbroadband.com/5MB.zip",None,headers)
response = urllib.request.urlopen(request)
data = response.read()
You're almost there. So you've got data:
ofile = open(where_you_want_to_store_the_data, "wb")
ofile.write(data)
ofile.close()
For a cleaner way you can use urlretrieve:
from urllib.request import urlretrieve
urlretrieve(url, "/path/to/something.txt")
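If the file is large, a sketch that streams the body straight to disk instead of holding it all in memory (the target path is just an example):
import shutil
import urllib.request

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
request = urllib.request.Request("http://download.thinkbroadband.com/5MB.zip",
                                 headers={'User-Agent': user_agent})
with urllib.request.urlopen(request) as response, open("/path/to/5MB.zip", "wb") as out:
    shutil.copyfileobj(response, out)  # copy the response body to disk in chunks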

using python 3.5 saving csv from url drops CR and LF

I'm using Python 3.5.0 to grab some census data. My script does retrieve the data from the URL and saves it, but the saved file can't be imported into SQL because it has somehow dropped the {CR}{LF}. How can I make the saved file importable into SQL?
import urllib.request

try:
    url = 'https://www.census.gov/popest/data/counties/asrh/2014/files/CC-EST2014-ALLDATA.csv'
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    saveFile = open('Vintage2014.csv', 'w')
    saveFile.write(str(respData))  # str() of a bytes object writes its repr, not the raw text
    saveFile.close()
except Exception as e:
    print(str(e))
Note, the file you are trying to download does not contain CRLF, only LF. The underlying bug is that str(respData) produces the printable representation of a bytes object, so the line endings are written out as the literal characters \n instead of real line breaks.
Decode the bytes to a string instead; writing the decoded string in text mode will also translate LF into CRLF on Windows:
import urllib.request

try:
    url = 'https://www.census.gov/popest/data/counties/asrh/2014/files/CC-EST2014-ALLDATA.csv'
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    with open('Vintage2014.csv', 'w') as saveFile:
        saveFile.write(respData.decode('latin-1'))
except Exception as e:
    print(str(e))
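Alternatively, if you want the file byte-for-byte as the server sent it, a minimal sketch is to skip decoding and write in binary mode:
with open('Vintage2014.csv', 'wb') as saveFile:
    saveFile.write(respData)  # raw bytes; binary mode does no newline translation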
