Python: splitting parsed items into a CSV file

I got advice from Jamie Bull and PM 2Ring to use the CSV module for the output of my web scraper. I'm nearly done, but I have an issue with some parsed items that are separated by a colon or hyphen. I want those items split into two items in the current list.
Current output:
GB,16,19,255,1,26:40,19,13,4,2,6-12,0-1,255,57,4.5,80,21,3.8,175,23-33,4.9,3,14,1,4,38.3,8,65,1,0
Sea,36,25,398,1,33:20,25,8,13,4,4-11,1-1,398,66,6.0,207,37,5.6,191,19-28,6.6,1,0,0,2,33.0,4,69,2,1
Desired output (the differences are the colon- and hyphen-separated values, which should each become separate fields):
GB,16,19,255,1,26,40,19,13,4,2,6,12,0,1,255,57,4.5,80,21,3.8,175,23,33,4.9,3,14,1,4,38.3,8,65,1,0
Sea,36,25,398,1,33,20,25,8,13,4,4,11,1,1,398,66,6,207,37,5.6,191,19,28,6.6,1,0,0,2,33,4,69,2,1
I am unsure where or how to make these changes. I also don't know if regex is needed. Obviously I could handle this in notepad or Excel but my goal is to handle all this in Python.
If you run the program, the above results are from the 2014 season, week 1.
import requests
import re
from bs4 import BeautifulSoup
import csv
year_entry = raw_input("Enter year: ")
week_entry = raw_input("Enter week number: ")
week_link = requests.get("http://sports.yahoo.com/nfl/scoreboard/?week=" + week_entry + "&phase=2&season=" + year_entry)
page_content = BeautifulSoup(week_link.content)
a_links = page_content.find_all('tr', {'class': 'game link'})
csvfile = open('NFL_2014.csv', 'a')
writer = csv.writer(csvfile)
for link in a_links:
    r = 'http://www.sports.yahoo.com' + str(link.attrs['data-url'])
    r_get = requests.get(r)
    soup = BeautifulSoup(r_get.content)
    stats = soup.find_all("td", {'class': 'stat-value'})
    teams = soup.find_all("th", {'class': 'stat-value'})
    scores = soup.find_all('dd', {"class": 'score'})
    try:
        away_game_stats = []
        home_game_stats = []
        statistic = []
        game_score = scores[-1]
        game_score = game_score.text
        x = game_score.split(" ")
        away_score = x[1]
        home_score = x[4]
        home_team = teams[1]
        away_team = teams[0]
        away_team_stats = stats[0::2]
        home_team_stats = stats[1::2]
        away_game_stats.append(away_team.text)
        away_game_stats.append(away_score)
        home_game_stats.append(home_team.text)
        home_game_stats.append(home_score)
        for stats in away_team_stats:
            text = stats.text.strip("").encode('utf-8')
            away_game_stats.append(text)
        writer.writerow(away_game_stats)
        for stats in home_team_stats:
            text = stats.text.strip("").encode('utf-8')
            home_game_stats.append(text)
        writer.writerow(home_game_stats)
    except:
        pass
csvfile.close()
Any help is greatly appreciated. This is my first program and searching this board has been a great resource.
Thanks,
JT

You can use regular expressions to split the strings and then "flatten" the resulting list of lists, so the sub-lists aren't written as quoted strings, like this:
Substitute
writer.writerow(away_game_stats)
with
away_game_stats = [re.split(r"-|:",x) for x in away_game_stats]
writer.writerow([x for y in away_game_stats for x in y])
(and same for writer.writerow(home_game_stats))
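Putting those two lines together on a shortened version of one of the rows above gives, for example:
import re

# a few fields from the GB row in the question
away_game_stats = ['GB', '16', '19', '255', '1', '26:40', '6-12', '0-1']

# split every field on ':' or '-' ...
away_game_stats = [re.split(r"-|:", x) for x in away_game_stats]
# ... then flatten the list of lists before handing it to writer.writerow
flat = [x for y in away_game_stats for x in y]
print(flat)
# ['GB', '16', '19', '255', '1', '26', '40', '6', '12', '0', '1']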

import re
test_string = "GB,16,19,255,1,26:40,19,13,4,2,6-12,0-1,255,57,4.5,80,21,3.8,175,23-33,4.9,3,14,1,4,38.3,8,65,1,0"  # one of the rows above
print re.sub(r"-|:", ",", test_string)
See demo.
https://regex101.com/r/aQ3zJ3/2

Related

How to convert your script into a class?

I wrote some code to convert text in a PDF file into a pandas DataFrame. The code works very well normally, but when I try to fit it into a class and define a function for it, it returns an error.
import pdfplumber
import pandas as pd
import re
cols = ["Declaration Number", "Declaration Date", "Warehouse", "Quantity", "Number of boxes", "Product name", "Invoice Number"]
dataset = []
quant = []
date = []
decl_date = []
decl = re.compile(r'\d{8}AN\d{6}')
decd = re.compile(r'\d{2}\.\d{2}\.\d{4}')
whse = re.compile(r'ANTREPO | LİMAN')
qty = re.compile(r'\d.KAP')
prod = re.compile(r'Ticari')
invNo = re.compile(r'Fatura')
class pdf():
    def __init__(self):
        self.kap = None
        self.kg = None

    def FirstPage():
        with pdfplumber.open("44550500AN087999.pdf") as pdf:
            page = pdf.pages[0]
            text = page.extract_text()
            for line in text.split('\n'):
                if decl.search(line):
                    decl_num = line.split()[-1]
                if decd.search(line):
                    decl_date = []
                    date = []
                    decl_date.append(line.split())
                    date = decl_date[1][-1]
                if whse.search(line):
                    warehouse = line.split()
                if qty.search(line):
                    quant = line.split()
                    kap = quant[0] + " " + quant[1]
                    kg = quant[2] + " " + quant[3]
When I run it, it returns several errors.
For instance:
<ipython-input-26-bc082b4afef0> in FirstPage()
20 date = []
21 decl_date.append(line.split())
---> 22 date = decl_date[1][-1]
23 if whse.search(line):
24 warehouse = line.split()
IndexError: list index out of range
I am probably defining the variables wrong, but I am a newbie, so does anyone have any idea what I am doing wrong?
You are only putting one element into decl_date and then trying to access the second element of that list, which does not exist.
Your use of line.split() also seems off to me: decl_date.append(line.split()) appends the whole list of whitespace-separated tokens as a single element, so decl_date only ever holds one item.
I assume you want to split the string using the regex in each if-statement; in that case, change line.split() to pattern.split(line)[index], swapping out pattern and index for the compiled pattern and the piece you need.
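As an illustration, a minimal sketch of the date branch with the indexing fixed (assuming, as in your code, that the date is the last whitespace-separated token on the line):
if decd.search(line):
    tokens = line.split()   # whitespace-separated tokens of the line
    date = tokens[-1]       # last token, presumably the dd.mm.yyyy date
    # or pull the match straight from the regex:
    # date = decd.search(line).group(0)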

How to get the mean of an int variable within 6 months based on a date variable

I would like to know how I could get the mean of notes by grouping dates into 6-month periods. In other words, let's say I want the mean of notes for all the comments between 01/01/2020 and 30/06/2020, and also between 01/07/2020 and 31/12/2020.
You get the idea :)
I also would like to know the number of comments within 6 months.
But I suppose this is quite the same process.
Here are some rows of my database:
Here's how I obtained it with web scraping:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = [ '{root}?page={i}'.format(root=root_url, i=i) for i in range(1,807) ]
comms = []
notes = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_='review-content__text').text.strip()
        except:
            comm = container.find('a', class_='link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_='star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)

data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})

data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
#print(data.head())
data.to_csv('file.csv', sep=';', index=False)
Here's the function I used to obtain my comms_clean and month columns:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    text = [word for word, pos in text if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    text = [word for word in text if not word in stop_words]
    text = [word for word in text if len(word) > 2]
    final_text = ' '.join([w for w in text if len(w) > 2])  # remove words with one letter
    return final_text

data['comms_clean'] = data['comms'].apply(lambda x: clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
I suppose we can get that with the .dt accessor in pandas, but I couldn't find out how. Do you have any idea or pointer on how to get that?
Thank you :)
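For reference, a minimal sketch of one way to do this with pandas, assuming the data DataFrame built above with dates already converted to datetime, and assuming notes still holds the alt text of the star image (so it has to be turned into a number first):
import pandas as pd

# assumption: the alt text contains the rating digit, e.g. "4 étoiles sur 5"
data['notes'] = data['notes'].str.extract(r'(\d+)', expand=False).astype(float)

# label each row with its half-year, e.g. "2020-H1" for January-June 2020
data['half'] = data['dates'].dt.year.astype(str) + '-H' + ((data['dates'].dt.month > 6) + 1).astype(str)

# mean of the notes and number of comments per half-year
summary = data.groupby('half')['notes'].agg(['mean', 'count'])
print(summary)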

CSV (Excel) - Python: data seems to be written to the CSV incorrectly

I'm trying to export some data from a website, and I first tried it on a single page. I have to extract text delimited by these titles:
['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name']
The url is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works and gives me all the data, but when I write it to the CSV, the information is not delimited as I wish.
As it is one single page, the Excel file should have ONE row... but it doesn't.
The code:
from bs4 import BeautifulSoup
import requests
import csv

csv_file = open('Drugs.csv', 'w')
csv_writer = csv.writer(csv_file, delimiter='+')
csv_writer.writerow(['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature References','Additional Information','Approval Date','Date Created','Company Name'])

link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux = []
soup = BeautifulSoup(link.content, 'lxml')
drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')

y = 0
for h2 in gralInfo.find_all('h2'):
    print(y)
    text = ''
    for sibling in h2.find_next_siblings():
        if (sibling.name == 'h2'):
            break
        else:
            text = text + sibling.get_text(separator='\n') + '\n'
    print(text)
    aux.append(text)
    print()
    print()
    y = y + 1

auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
    print(info.text)
    auxi.append(info.text)

csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])
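One likely cause, offered as an assumption rather than a confirmed diagnosis: the scraped sections contain embedded newlines, and the file is opened without newline='', so the single logical record ends up spread over several visual rows in Excel, and the '+' delimiter keeps Excel from splitting the columns. A minimal sketch of how the row could be written instead, assuming drugName, aux and auxi are filled as in the code above:
import csv

row = [drugName] + aux[:6] + auxi[:3]
# collapse embedded newlines so every field stays on one visual line in Excel
row = [' '.join(str(field).split()) for field in row]

with open('Drugs.csv', 'w', newline='') as csv_file:   # newline='' avoids stray blank rows on Windows
    csv_writer = csv.writer(csv_file)                   # default comma delimiter, which Excel expects
    csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results', 'Side Effects',
                         'Mechanism of Action', 'Literature References', 'Additional Information',
                         'Approval Date', 'Date Created', 'Company Name'])
    csv_writer.writerow(row)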

Can't figure out how to properly output my data

I'm a relative novice at Python, yet I somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 columns at the end of my .CSV, each outputting the X-th most commonly used value.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter

ts = time.gmtime()

def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)

    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include the csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.
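As an illustration of that last suggestion, a minimal sketch (the rows list is a placeholder I'm introducing, not part of the original script):
import pandas as pd

rows = []  # one dict per user; inside the main loop you would do: rows.append(post_info)

# ... after the loop has filled `rows`:
df = pd.DataFrame(rows)                       # one row per user, columns from the dict keys
df.to_csv('users_with_er.csv', index=False)   # write everything to the CSV in one go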
Since most_common is the output of the call to hashtags_counted.most_common(), I had a look at the docs here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
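For example, using the values from the question:
most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2)]
print([x[0] for x in most_common])
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron']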

Shaking off duplicates while parsing

I've made a parser written in Python which is doing its job perfectly, except for some duplicates coming along. Moreover, when I open the CSV file I can see that every result is surrounded by square brackets. Is there any workaround to get rid of the duplicate data and square brackets on the fly? Here is what I tried:
import csv
import requests
from lxml import html
def parsingdata(mpg):
    data = set()
    outfile = open('RealYP.csv', 'w', newline='')
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Address", "Phone"])
    pg = 1
    while pg <= mpg:
        url = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=" + str(pg)
        page = requests.get(url)
        tree = html.fromstring(page.text)
        titles = tree.xpath('//div[@class="info"]')
        items = []
        for title in titles:
            comb = []
            Name = title.xpath('.//span[@itemprop="name"]/text()')
            Address = title.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()')
            Phone = title.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()')
            try:
                comb.append(Name[0])
                comb.append(Address[0])
                comb.append(Phone[0])
            except:
                continue
            items.append(comb)
        pg += 1
        for item in items:
            writer.writerow(item)

parsingdata(3)
Now it is working fine.
Edit: Rectified portion taken from bjpreisler
This script removes dups when I am working with a .csv file. Check if this works for you :)
with open(file_out, 'w') as f_out, open(file_in, 'r') as f_in:
    # write rows from in-file to out-file until all the data is written
    checkDups = set()  # set for removing duplicates
    for line in f_in:
        if line in checkDups: continue  # skip duplicate
        checkDups.add(line)
        f_out.write(line)
You are currently writing a list (items) to the csv which is why it is in brackets. To avoid this, use another for loop that could look like this:
for title in titles:
    comb = []
    Name = title.xpath('.//span[@itemprop="name"]/text()')
    Address = title.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()')
    Phone = title.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()')
    if Name:
        Name = Name[0]
    if Address:
        Address = Address[0]
    if Phone:
        Phone = Phone[0]
    comb.append(Name)
    comb.append(Address)
    comb.append(Phone)
    print(comb)
    items.append(comb)
pg += 1
for item in items:
    writer.writerow(item)

parsingdata(3)
This should write each item separately to your csv. It turns out the items you were appending to comb were lists themselves, so this extracts them.
And the concise version of this scraper I found lately is:
import csv
import requests
from lxml import html
url = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page={0}"
def parsingdata(link):
    outfile = open('YellowPage.csv', 'w', newline='')
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Address", "Phone"])
    for page_link in [link.format(i) for i in range(1, 4)]:
        page = requests.get(page_link).text
        tree = html.fromstring(page)
        for title in tree.xpath('//div[@class="info"]'):
            Name = title.findtext('.//span[@itemprop="name"]')
            Address = title.findtext('.//span[@itemprop="streetAddress"]')
            Phone = title.findtext('.//div[@itemprop="telephone"]')
            print([Name, Address, Phone])
            writer.writerow([Name, Address, Phone])

parsingdata(url)
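Note that this concise version still writes duplicates if the same listing shows up on several pages. A small sketch of how the inner loop could skip rows it has already written (the seen set is my addition, not part of the original; it would be created once, before the page loop):
seen = set()  # rows already written to the CSV

for title in tree.xpath('//div[@class="info"]'):
    Name = title.findtext('.//span[@itemprop="name"]')
    Address = title.findtext('.//span[@itemprop="streetAddress"]')
    Phone = title.findtext('.//div[@itemprop="telephone"]')
    row = (Name, Address, Phone)
    if row in seen:   # skip duplicates on the fly
        continue
    seen.add(row)
    writer.writerow(row)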
