Using Python 3.5, saving CSV from URL drops CR and LF - python

I'm using Python 3.5.0 to grab some census data. My script retrieves the data from the URL and saves it, but the saved file can't be imported into SQL because it has somehow dropped the {CR}{LF} line endings. How can I make the saved file importable into SQL?
import urllib.request

try:
    url = 'https://www.census.gov/popest/data/counties/asrh/2014/files/CC-EST2014-ALLDATA.csv'
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    saveFile = open('Vintage2014.csv', 'w')
    saveFile.write(str(respData))  # str() on the raw bytes is the culprit here
    saveFile.close()
except Exception as e:
    print(str(e))

Note, the file you are trying to download does not contain CRLF line endings, only LF.
You could use the following approach to convert the bytes to a suitable string. Writing it back out in text mode should also result in you getting CRLF (on Windows, where the platform default newline is CRLF):
import urllib.request

try:
    url = 'https://www.census.gov/popest/data/counties/asrh/2014/files/CC-EST2014-ALLDATA.csv'
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    with open('Vintage2014.csv', 'w') as saveFile:
        saveFile.write(respData.decode('latin-1'))
except Exception as e:
    print(str(e))
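If you need CRLF explicitly regardless of platform, the newline parameter of open() controls what '\n' is translated to on write. A minimal sketch of that variant:

# Force CRLF on write on any platform: text mode translates every
# '\n' in the decoded string to the newline value given here.
with open('Vintage2014.csv', 'w', newline='\r\n') as saveFile:
    saveFile.write(respData.decode('latin-1'))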

Related

requests.get(link, headers=headers).json() fails to load, but in the browser it loads OK

I am trying to access a URL with requests.get, but in the terminal I always get this error:
Failed to open https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q=%7B%22%24match%22%3A%7B%22collectionSymbol%22%3A%22meekolony%22%7D%2C%22%24sort%22%3A%7B%22takerAmount%22%3A1%2C%22createdAt%22%3A-1%7D%2C%22%24skip%22%3A0%2C%22%24limit%22%3A5%7D
However, if I click that same link to open it in the browser, it loads just fine.
Here is part of the code:
link = "https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q=%7B%22%24match%22%3A%7B%22collectionSymbol%22%3A%22"+str(nameCollection)+"%22%7D%2C%22%24sort%22%3A%7B%22takerAmount%22%3A1%2C%22createdAt%22%3A-1%7D%2C%22%24skip%22%3A0%2C%22%24limit%22%3A5%7D"
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# make the request from the website and convert the data into a JSON file
try:
# print(resp) will show all the data collected
resp = requests.get(link, headers=headers).json()
except:
return None
It is strange because some weeks ago everything worked perfectly. So maybe the website changed the way it serves data recently?
Thank you in advance for your help :)
You may have more success with urllib3 as follows:
import sys
import io
import json
import urllib3

URL = 'https://api-mainnet.magiceden.io/rpc/getListedNFTsByQuery?q={"$match":{"collectionSymbol":"meekolony"},"$sort":{"takerAmount":1,"createdAt":-1},"$skip":0,"$limit":5}'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0'
}

with urllib3.PoolManager() as http:
    try:
        if (r := http.request('GET', URL, headers=HEADERS, preload_content=False)).status != 200:
            raise Exception(f'HTTP status {r.status}')
        r.auto_close = False
        j = json.load(io.TextIOWrapper(r))
        print(json.dumps(j, indent=2))
    except Exception as e:
        print(e, file=sys.stderr)
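Note that the assignment expression (:=) in this snippet requires Python 3.8 or later. preload_content=False leaves the body unread so json.load can stream it through io.TextIOWrapper, and auto_close = False stops the urllib3 response from closing itself at EOF before the wrapper is done with it.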

Python requests GET not getting the JSON payload?

I am trying to get the JSON data from the following URL:
import requests as r

url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}
resp = r.get(url, stream=True, timeout=20, headers=header)
j = resp.json()
I do get JSON from this, but in the browser's inspector I can see data in the Response payload that is not present in j.
I have never faced this problem before, and my search only led me to questions about POST.
I tested it using Postman. The User-Agent value is your problem; you can simply remove it and it will work.
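For instance, a minimal sketch of the same request with the User-Agent entry dropped, keeping the rest of the question's code as-is:

import requests as r

url = "https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json"
# same request, but without the User-Agent header
header = {"X-Requested-With": "XMLHttpRequest"}
resp = r.get(url, stream=True, timeout=20, headers=header)
j = resp.json()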
I might be wrong and have misunderstood the question, but comparing the data fetched from the UI with the data fetched from the API shows that they are the same:
import json
import requests
from selenium import webdriver

url = 'https://www.nseindia.com/json/CorporateFiling/CF-corpactions-equity.json'
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    "X-Requested-With": "XMLHttpRequest"
}

# fetch the JSON as the browser renders it
driver = webdriver.Chrome()
driver.get(url)
content = driver.find_element_by_xpath('//pre').text
driver.quit()

# fetch the same JSON through the API
response = requests.get(url,
                        stream=True,
                        timeout=20,
                        headers=header
                        )

print(json.loads(content) == response.json())
assert json.loads(content) == response.json()
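One caveat if you run this today: find_element_by_xpath has been removed in recent Selenium releases. On Selenium 4+, the equivalent lookup is:

from selenium.webdriver.common.by import By

content = driver.find_element(By.XPATH, '//pre').text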

Download a pdf file from a website that requires login using requests, python3

I want to download a PDF from a website using requests; the website requires you to log in before you can access the PDF file.
I am using this script, but it isn't working. What is the problem? I used some code from another post, but I couldn't figure out how to resolve this issue.
import requests
import sys
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
login_data = {
    'Email': 'My-email',
    'Password': 'My-password',
    'login': 'Login'
}
url = 'https://download-website'  # the website I want to download the file from
filename = 'filename.pdf'

# creating a connection to the pdf
print("Creating the connection ...")
with requests.session() as s:
    url1 = 'https://login-website/'  # the website I want to log in to
    r = s.get(url1, headers=headers, stream=True)
    soup = BeautifulSoup(r.content, 'html5lib')
    login_data['__RequestVerificationToken'] = soup.find('input', attrs={'name': '__RequestVerificationToken'})['value']
    r = s.post(url1, data=login_data, headers=headers, stream=True)
    with requests.get(url, stream=True) as r:
        if r.status_code != 200:
            print("Could not download the file '{}'\nError Code : {}\nReason : {}\n\n".format(
                url, r.status_code, r.reason), file=sys.stderr)
        else:
            # Storing the file as a pdf
            print("Saving the pdf file :\n\"{}\" ...".format(filename))
            with open(filename, 'wb') as f:
                try:
                    total_size = int(r.headers['Content-length'])
                    saved_size_pers = 0
                    moversBy = 8192 * 100 / total_size
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            saved_size_pers += moversBy
                            print("\r=>> %.2f%%" % (
                                saved_size_pers if saved_size_pers <= 100 else 100.0), end='')
                    print(end='\n\n')
                except Exception:
                    print("==> Couldn't save : {}\\".format(filename))
                f.flush()
            r.close()
    r.close()
I can only guess, because I do not know the link to the website. Try writing the keys of the user data in lower case. If that doesn't work, use your browser's developer tools to find out what the login form of the website actually expects.
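As a starting point, a rough sketch (reusing s, url1, and headers from the question's script) that prints every input field the login form contains, so you can see which keys and hidden tokens login_data needs:

# Sketch: list the name/value pairs the login form actually submits
soup = BeautifulSoup(s.get(url1, headers=headers).content, 'html5lib')
for field in soup.find_all('input'):
    print(field.get('name'), '=', field.get('value'))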

Python - Urllib.Request - Change location of downloaded file

How can I choose the location where the downloaded file is stored? My code:
import urllib.request
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}
request = urllib.request.Request("http://download.thinkbroadband.com/5MB.zip", None, headers)
response = urllib.request.urlopen(request)
data = response.read()
You're almost there. So you've got data:
ofile = open(where_you_want_to_store_the_data,"wb")
ofile.write(data)
ofile.close()
For a cleaner way you can use urlretrieve (note it needs an import):
from urllib.request import urlretrieve

urlretrieve(url, "/path/to/something.txt")

Downloading a webpage using urllib2 results in garbled junk? (only sometimes)

How come when I hit this webpage, I get HTML text:
http://itunes.apple.com/us/app/mobile/id381057839
But when I hit this webpage, I get garbled junk?
http://itunes.apple.com/us/app/mobile/id375562663
I use the same download() function in Python for both, shown here:
import socket
import urllib2

def download(source_url):
    try:
        socket.setdefaulttimeout(10)
        agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.10) Gecko/20100914 AlexaToolbar/alxf-1.54 Firefox/3.6.10 GTB7.1"
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', agent)
        ree.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        ree.add_header("Accept-Language", "en-us,en;q=0.5")
        ree.add_header("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7")
        ree.add_header("Accept-Encoding", "gzip,deflate")
        ree.add_header("Host", "itunes.apple.com")
        resp = urllib2.urlopen(ree)
        htmlSource = resp.read()
        return htmlSource
    except Exception, e:
        print e
Solved. It was a compression issue.
import gzip
import random
import socket
import urllib2
import StringIO

def download(source_url):
    try:
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)', 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)', 'Microsoft Internet Explorer/4.0b1 (Windows 95)', 'Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        h = opener.open(ree).read()
        # the response body is gzip-compressed; decompress it
        compressedstream = StringIO.StringIO(h)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        data = gzipper.read()
        return data
    except Exception, e:
        print e
        return ""
