Write JSON elements to a CSV file in Python

I just created a Python program which queries the Google mobile-friendly (PageSpeed) API to check whether the target site is mobile friendly or not, and based on the response it extracts certain JSON elements. It also saves the screenshot to a local folder.
The script is working fine, but when I try to write those JSON values to a CSV file, it doesn't work.
Here is my code:
import requests, json, string, random, time
import csv
from base64 import decodestring
from random import randint

#links = open(r'D:\\Carlos\\Links.txt')
links = ['https://www.googleapis.com/pagespeedonline/v3beta1/mobileReady?key=AIzaSyDkEX-f1JNLQLC164SZaobALqFv4PHV-kA&screenshot=true&snapshots=true&locale=en_US&url=https://www.economicalinsurance.com/en/&strategy=mobile&filter_third_party_resources=false',
         'https://www.googleapis.com/pagespeedonline/v3beta1/mobileReady?key=AIzaSyDkEX-f1JNLQLC164SZaobALqFv4PHV-kA&screenshot=true&snapshots=true&locale=en_US&url=http://www.volkswagen-me.com/en-vwme/service/protection/motor-insurance.html&strategy=mobile&filter_third_party_resources=false']

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

i = 12

def get_data(each):
    try:
        r = requests.get(each)
    except:
        pass
    #time.sleep(randint(1, 3))
    try:
        json_data = json.loads(r.text)
    except:
        pass
    try:
        score = json_data['ruleGroups']['USABILITY']['score']; score = int(score)
    except:
        pass
    try:
        Pass = json_data['ruleGroups']['USABILITY']['pass']; Pass = str(Pass)
    except:
        pass
    try:
        ConfigureViewport = json_data['formattedResults']['ruleResults']['ConfigureViewport']['localizedRuleName']; ConfigureViewport = str(ConfigureViewport)
    except:
        pass
    try:
        UseLegibleFontSizes = json_data['formattedResults']['ruleResults']['UseLegibleFontSizes']['localizedRuleName']; UseLegibleFontSizes = str(UseLegibleFontSizes)
    except:
        pass
    try:
        AvoidPlugins = json_data['formattedResults']['ruleResults']['AvoidPlugins']['localizedRuleName']; AvoidPlugins = str(AvoidPlugins)
    except:
        pass
    try:
        SizeContentToViewport = json_data['formattedResults']['ruleResults']['SizeContentToViewport']['localizedRuleName']; SizeContentToViewport = str(SizeContentToViewport)
    except:
        pass
    try:
        SizeTapTargetsAppropriately = json_data['formattedResults']['ruleResults']['SizeTapTargetsAppropriately']['localizedRuleName']; SizeTapTargetsAppropriately = str(SizeTapTargetsAppropriately)
    except:
        pass
    try:
        AvoidInterstitials = json_data['formattedResults']['ruleResults']['AvoidInterstitials']['localizedRuleName']; AvoidInterstitials = str(AvoidInterstitials)
    except:
        pass
    try:
        image_link = json_data['screenshot']['data']; image_link = image_link.replace("_", "/").replace("-", "+")
    except:
        pass
    #try:
    id_generator_name = "".join([random.choice(string.letters) for i in xrange(15)]) + '.jpeg'
    #except:
    #    pass
    #try:
    fh = open(id_generator_name, "wb")
    #except:
    #    pass
    try:
        fh.write(str(image_link).decode('base64'))
        time.sleep(1)
    except:
        pass
    try:
        fh.close()
    except:
        pass
    try:
        error_code = json_data['error']['message']; error_code = str(error_code)
    except:
        pass
    try:
        print each, score, Pass, ConfigureViewport, UseLegibleFontSizes, AvoidPlugins, SizeContentToViewport, SizeTapTargetsAppropriately, AvoidInterstitials, error_code
    except:
        pass
    try:
        writer.writerow({'each': each, 'score': score, 'Pass': Pass, 'ConfigureViewport': ConfigureViewport,
                         'UseLegibleFontSizes': UseLegibleFontSizes, 'AvoidPlugins': AvoidPlugins,
                         'SizeContentToViewport': SizeContentToViewport, 'SizeTapTargetsAppropriately': SizeTapTargetsAppropriately,
                         'AvoidInterstitials': AvoidInterstitials, 'error_code': error_code, 'imagename': id_generator_name})
    except:
        pass

#path to the csv file
with open("D:\Carlos\Data_file\output.csv", "ab") as export:
    fieldnames = ['each', 'score', 'Pass', 'ConfigureViewport', 'UseLegibleFontSizes', 'AvoidPlugins', 'SizeContentToViewport',
                  'SizeTapTargetsAppropriately', 'AvoidInterstitials', 'error_code', 'imagename']
    writer = csv.DictWriter(export, fieldnames=fieldnames)
    writer.writeheader()
    for each in links:
        #try:
        get_data(each)
        #except:
        #    pass
Please advise on how to write to the CSV file, or point out where things are wrong in the code.

I like to use Pandas dataframes for this, but it may be overkill if you wouldn't use Pandas otherwise. Pandas dataframes are also great for analysis and comparison.
You would put the JSON into a dataframe, and then output the dataframe to a CSV file.
import pandas as pd
df = pd.read_json('path/to/json/file')
df.to_csv('filename.csv')
Note that it's this simple only when your JSON has one level and might as well be a csv. Otherwise, you would need to read the JSON into a dict, navigate to the appropriate level and then read that into a dataframe.
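For example, with a nested response like the PageSpeed JSON in the question, a minimal sketch of that approach (the field names are taken from the question, the file names are placeholders) could look like:

import json
import pandas as pd

# Load the raw JSON into a dict and pull out just the fields you care about.
with open('path/to/json/file') as f:
    data = json.load(f)

row = {
    'score': data.get('ruleGroups', {}).get('USABILITY', {}).get('score'),
    'pass': data.get('ruleGroups', {}).get('USABILITY', {}).get('pass'),
}

# One dict per response; build the DataFrame from a list of such dicts.
df = pd.DataFrame([row])
df.to_csv('filename.csv', index=False)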
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
http://pandas.pydata.org/

Related

python threading to download list of dictionaries

I have a list of 1000 dictionaries; each dictionary contains a URL and a file name.
for di in images_to_download:
    temp = download_image(di['img_url'], di['image_full_name'])
    if not temp:
        continue
    upload_file(temp, t4['id'])

def download_image(img_url, image_full_name):
    try:
        req = request(img_url, headers=headers)
        raw_img = urlopen(req).read()
        try:
            with open(image_full_name, 'wb') as file:
                file.write(raw_img)
        except:
            image_full_name = query + str(random.randint(100000))
            with open(image_full_name, 'wb') as file:
                file.write(raw_img)
        return image_full_name
    except Exception as e:
        print("Download failed: {}".format(e))
        return None

def upload_file(file_name, folder_id):
    file2 = drive.CreateFile({'parents': [{'id': folder_id}]})
    file2.SetContentFile(file_name)
    file2.Upload()
This is my code so far. It works well, but it is slow (3-6 seconds per image), so I would like to thread it. How can I do that?
The images are from different websites, so I can download them in parallel without hitting any one site with requests too quickly.
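One common approach (not from the original post) is a thread pool from concurrent.futures; a rough sketch, assuming the images_to_download list, the t4 folder object, and the download_image / upload_file functions defined above:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_one(di):
    # Download first; skip the upload if the download failed.
    temp = download_image(di['img_url'], di['image_full_name'])
    if temp:
        # Note: if the Drive client is not thread-safe, keep uploads in the main thread instead.
        upload_file(temp, t4['id'])

# A handful of workers is usually enough for I/O-bound downloads.
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(process_one, di) for di in images_to_download]
    for future in as_completed(futures):
        future.result()  # re-raise any exception from the worker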

Unable to store pandas data frame as a csv

I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce

def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources

def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d

def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'

def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')

if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give a path name to pd.read_csv, but I don't know which path I should give here.
This error would make sense if there wasn't already a data folder in the directory you are executing this program from. There is a similar problem in the post here.
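For example, a minimal guard (using the ./data path from the traceback) would be to create the folder before the script reads or writes anything in it:

import os

# Make sure ./data exists before pandas tries to read or write news.csv inside it.
os.makedirs('./data', exist_ok=True)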

Python check if website exists for a list of websites

I want to check if a website exists, given a list of websites in the format XXXXX.com, where XXXXX is a 5-digit number. So I want to go through everything from 00000 up to 99999 and see whether those variants of the website exist.
I want to do something like
import requests
request = requests.get('http://www.example.com')
if request.status_code == 200:
    print('Web site exists')
else:
    print('Web site does not exist')
But I'd like to generate a list of some sort (or even just export a list to CSV), so that for each URL I know whether it exists or not.
Any advice would be great!
I'm going to make an assumption that you have a large list of URLs and want to read them in from some source file, let's say a text file, rather than hard-coding a large list of URLs in the Python file. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()
file = open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        # 200
        print('good' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=20)
    result = p.map(checkurl, urls)
    print("done in : ", time.time() - start)
Try combining xrange and the string zfill method in a loop.
import requests

def test_for_200(url):
    req = requests.get(url)
    return req.status_code == 200

def numbers():
    for n in xrange(100000):
        yield str(n).zfill(5)

results = {}
for num in numbers():
    url = "http://{}.com".format(num)
    results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}
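If you then want the CSV export mentioned in the question, a short Python 3 sketch (output filename assumed) built on that results dict could be:

import csv

# Write one row per candidate number, with whether the site responded with 200.
with open('website_checks.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['number', 'exists'])
    for num, exists in sorted(results.items()):
        writer.writerow([num, exists])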

Output table contents with limit and filter

I'm finding the boto DynamoDB documentation almost completely lacking in examples.
In Python, I simply want to output the contents of a table with a limit on the number of records, say the 500 latest ones from a certain date.
Here is what I have...
import boto.dynamodb
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb.connect_to_region(
    'us-east-1',
    aws_access_key_id='somekey',
    aws_secret_access_key='somesecretkey')
#----------------------------------------------------#

def info():
    print('#########################_TABLE_NAMES_#########################')
    #get and print list of tables
    tablenames = connection.list_tables()
    for table in tablenames:
        print('DynamoDB table: %s' % table)
        #print(connection.describe_table(table))
    print('###############################################################' + '\n')

def main():
    print('###########################_RESULTS_###########################')
    scan = myTable.scan(scan_filter=None, attributes_to_get=['SomeField'])
    results = []
    for x in scan:
        results.append(x['SomeField'])
    print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error)  # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        info()
        main()
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
The table I have doesn't have any custom indexes, so I'm looking for something pretty basic as an example.
I'm sorry I don't have a better attempt, but I've researched and not found a lot to go on.
I've modified your script to print out the first 500 scan results for each table. Don't forget to correct the field name (I put someField):
import boto.dynamodb2
from boto.dynamodb2.table import Table
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb2.connect_to_region(
    'us-east-1')
#----------------------------------------------------#

def getTableNames():
    '''get list of tables'''
    tablenames = connection.list_tables()["TableNames"]
    return tablenames

def main(tablenames=[]):
    print('###########################_RESULTS_###########################')
    for table in tablenames:
        print "Table Name: " + table
        myTable = Table(table)
        scan = myTable.scan()
        results = []
        for item in scan:
            if len(results) >= 500:
                break
            results.append(item.get('someField'))
        for result in results:
            print result
        print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error)  # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        tablenames = getTableNames()
        main(tablenames)
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
Please note that DynamoDB doesn't provide scan results in any order. If you want them ordered by the latest changes, you can use a solution based on DynamoDB Streams https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html or add a secondary index: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html
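If you do add such an index, a rough sketch of an ordered, limited query with boto3 (the table, index, and attribute names here are hypothetical, not from the question) might look like:

import boto3
from boto3.dynamodb.conditions import Key

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('YourTableName')  # hypothetical table name

# Query a (hypothetical) GSI whose sort key is a date attribute,
# newest items first, capped at 500 results.
response = table.query(
    IndexName='byDate-index',                      # hypothetical GSI name
    KeyConditionExpression=Key('recordType').eq('event') &
                           Key('createdAt').gt('2015-01-01'),
    ScanIndexForward=False,                        # descending by sort key
    Limit=500,
)
items = response['Items']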

Using WorkerPool to multithread through a list of URLs

I'm trying to use multiple threads to go through a text file of URLs and scrape the contents found at each URL. This works for about 20 URLs (how many is not consistent), but then it consistently gets stuck on the last URL in the file. It doesn't seem to be processing them in order.
I have no idea why it gets stuck or where to start, so thank you very much for your help.
from bs4 import BeautifulSoup, SoupStrainer
import urllib3
import urllib2
import io
import os
import re
import workerpool
from urllib2 import Request, urlopen, URLError

NUM_SOCKETS = 3
NUM_WORKERS = 5

urlfile = open("dailynewsurls.txt", 'r')  # read one line at a time until end of file
http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
workers = workerpool.WorkerPool(size=NUM_WORKERS)

class MyJob(workerpool.Job):
    def __init__(self, url):
        self.url = url

    def run(self):
        r = http.request('GET', self.url)
        req = urllib2.Request(url)
        try:
            page = urllib2.urlopen(req)
        except:
            print "had to skip one"
            return
        pagecontent = page.read()  # get a file-like object at this url
        #this tells it to soup the page that is at the url above
        soup = BeautifulSoup(pagecontent)
        #this tells it to find the string in the first instance of each of the tags in the parenthesis
        title = soup.find_all('title')
        article = soup.find_all('article')
        try:
            title = str(title[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return
        try:
            article = str(article[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return
        try:
            # make the file using the things above
            output_files_pathname = 'DailyNews/'  # path where output will go
            new_filename = title + ".txt"
            # write each of the things defined into the text file
            outfile = open(output_files_pathname + new_filename, 'w')
            outfile.write(title)
            outfile.write("\n")
            outfile.write(article)
            outfile.close()
            print "%r added as a text file" % title
            return
        except:
            print "had to skip one"
            return
        return

for url in urlfile:
    workers.put(MyJob(url))

workers.shutdown()
workers.wait()
print "All done."
Here's an example list of the urls:
http://www.nydailynews.com/entertainment/tv-movies/x-factor-season-2-episode-2-recap-oops-britney-spears-article-1.1159546
http://www.nydailynews.com/new-york/brooklyn/lois-mclohon-resurfaced-iconic-daily-news-coney-island-cheesecake-photo-brings-back-memories-50-year-long-romance-article-1.1160457
http://www.nydailynews.com/new-york/uptown/espaillat-linares-rivals-bitter-history-battle-state-senate-seat-article-1.1157994
http://www.nydailynews.com/sports/baseball/mlb-power-rankings-yankees-split-orioles-tumble-rankings-nationals-shut-stephen-strasburg-hang-top-spot-article-1.1155953
http://www.nydailynews.com/news/national/salon-sell-internet-online-communities-article-1.1150614
http://www.nydailynews.com/sports/more-sports/jiyai-shin-wins-women-british-open-dominating-fashion-record-nine-shot-victory-article-1.1160894
http://www.nydailynews.com/entertainment/music-arts/justin-bieber-offered-hockey-contract-bakersfield-condors-minor-league-team-article-1.1157991
http://www.nydailynews.com/sports/baseball/yankees/umpire-blown-call-9th-inning-dooms-yankees-5-4-loss-baltimore-orioles-camden-yards-article-1.1155141
http://www.nydailynews.com/entertainment/gossip/kellie-pickler-shaving-head-support-best-friend-cancer-fight-hair-article-1.1160938
http://www.nydailynews.com/new-york/secret-103-000-settlement-staffers-accused-assemblyman-vito-lopez-sexual-harassment-included-penalty-20k-involved-talked-details-article-1.1157849
http://www.nydailynews.com/entertainment/tv-movies/ricki-lake-fun-adds-substance-new-syndicated-daytime-show-article-1.1153301
http://www.nydailynews.com/sports/college/matt-barkley-loyalty-usc-trojans-contention-bcs-national-championship-article-1.1152969
http://www.nydailynews.com/sports/daily-news-sports-photos-day-farewell-andy-roddick-world-1-u-s-open-champ-retires-loss-juan-martin-del-potro-article-1.1152827
http://www.nydailynews.com/entertainment/gossip/britney-spears-made-move-relationship-fiance-jason-trawick-reveals-article-1.1152722
http://www.nydailynews.com/new-york/brooklyn/brooklyn-lupus-center-tayumika-zurita-leads-local-battle-disease-difficult-adversary-article-1.1153494
http://www.nydailynews.com/life-style/fashion/kate-middleton-prabal-gurung-dress-sells-hour-myhabit-site-sold-1-995-dress-599-article-1.1161583
http://www.nydailynews.com/news/politics/obama-romney-campaigns-vie-advantage-president-maintains-lead-article-1.1161540
http://www.nydailynews.com/life-style/free-cheap-new-york-city-tuesday-sept-11-article-1.1155950
http://www.nydailynews.com/news/world/dozens-storm-embassy-compound-tunis-article-1.1159663
http://www.nydailynews.com/opinion/send-egypt-message-article-1.1157828
http://www.nydailynews.com/sports/more-sports/witnesses-feel-sheryl-crow-lance-amstrong-activities-article-1.1152899
http://www.nydailynews.com/sports/baseball/yankees/hiroki-kuroda-replacing-cc-sabathia-yankees-ace-pitcher-real-possibility-playoffs-looming-article-1.1161812
http://www.nydailynews.com/life-style/eats/finland-hosts-pop-down-restaurant-belly-earth-262-feet-underground-article-1.1151523
http://www.nydailynews.com/sports/more-sports/mighty-quinn-sept-23-article-1.1165584
http://www.nydailynews.com/sports/more-sports/jerry-king-lawler-stable-condition-suffering-heart-attack-wwe-raw-broadcast-monday-night-article-1.1156915
http://www.nydailynews.com/news/politics/ambassador-chris-stevens-breathing-libyans-found-american-consulate-rescue-article-1.1161454
http://www.nydailynews.com/news/crime/swiss-banker-bradley-birkenfeld-104-million-reward-irs-blowing-whistle-thousands-tax-dodgers-article-1.1156736
http://www.nydailynews.com/sports/hockey/nhl-board-governors-votes-favor-lockout-league-players-association-fail-reach-agreement-cba-article-1.1159131
http://www.nydailynews.com/news/national/iphone-5-works-t-network-article-1.1165543
http://www.nydailynews.com/sports/baseball/yankees/yankees-broadcasters-michael-kay-ken-singleton-opportunity-important-statement-article-1.1165479
http://www.nydailynews.com/news/national/boss-year-michigan-car-dealer-retires-employees-1-000-year-service-article-1.1156763
http://www.nydailynews.com/entertainment/tv-movies/hero-denzel-washington-clint-eastwood-article-1.1165538
http://www.nydailynews.com/sports/football/giants/ny-giants-secondary-roasted-tony-romo-dallas-cowboys-offense-article-1.1153055
http://www.nydailynews.com/news/national/hide-and-seek-tragedy-3-year-old-suffocates-hiding-bean-bag-article-1.1160138
I would try using the threading module; here is something I think is working:
from bs4 import BeautifulSoup, SoupStrainer
import threading
import urllib2

def fetch_url(url):
    urlHandler = urllib2.urlopen(url)
    html = urlHandler.read()
    #this tells it to soup the page that is at the url above
    soup = BeautifulSoup(html)
    #this tells it to find the string in the first instance of each of the tags in the parenthesis
    title = soup.find_all('title')
    article = soup.find_all('article')
    try:
        title = str(title[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad title\n"
        return
    try:
        article = str(article[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad article"
        return
    try:
        # make the file using the things above
        output_files_pathname = 'DailyNews/'  # path where output will go
        new_filename = title + ".txt"
        # write each of the things defined into the text file
        outfile = open(output_files_pathname + new_filename, 'w')
        outfile.write(title)
        outfile.write("\n")
        outfile.write(article)
        outfile.close()
        print "%r added as a text file" % title
        return
    except:
        print "had to skip one cant write file"
        return
    return

with open("dailynewsurls.txt", 'r') as urlfile:
    # read one line at a time until end of file
    threads = [threading.Thread(target=fetch_url, args=(url,)) for url in urlfile]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
