Python In The Shell: A Way To Gather Steemit User Info – Post #105
Web URL Scraping And Python
Have you ever wondered how to gather information from a STEEMIT user’s home page? In this new series of discussions I will try to lay out how we can do this in Python, and more.
Things like the title of a post, who upvoted it, what comments it received and even its total rewards can be seen when we click on a post listed on the user’s home page.
So how do we gather this important information using Python?
The full script is below:
#!/usr/bin/python3.6
"""Print details of every blog post written by a Steemit user.

Usage:
    script.py AUTHOR

For each post authored by AUTHOR the script prints the post URL, title,
upvoters, rewards, comments and body.  The vote list comes from the Steem
node; everything else is scraped from the post's steemit.com page.
"""

###MODULES
import json
import os
import shutil
import sys

import requests
from bs4 import BeautifulSoup as soup
from steem import Steem

###SETTINGS
TEMPDIR = '/dev/shm/steemblogs'
NODE_URL = "https://api.steemit.com"
HEADERS = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# Seconds to wait for steemit.com before giving up on a request; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30


def _stage_own_posts(steem, author, tempdir):
    """Dump the author's blog entries to disk and return those they wrote.

    All entries returned by ``get_blog`` are written, one JSON document per
    line, to ``<tempdir>/steemblogs``.  The lines that contain
    ``"author": "<author>"`` are copied to ``<tempdir>/steemblogsb`` and
    returned as parsed objects.
    """
    ###GET ALL BLOGS FROM AUTHOR
    entries = steem.get_blog(author, entry_id=-1, limit=500)

    ###WRITE TO FILE FIRST
    raw_path = os.path.join(tempdir, 'steemblogs')
    with open(raw_path, 'a') as raw_file:
        for entry in entries:
            raw_file.write(json.dumps(entry) + '\n')

    # Fixed-string match done in Python instead of piping through
    # `cat | grep`: the author name comes from the command line, so it must
    # never be pasted into a shell command, and it must not be treated as a
    # regular expression either.  json.dumps renders the name exactly as it
    # appears in the staged lines.
    needle = '"author": ' + json.dumps(author)

    own_path = os.path.join(tempdir, 'steemblogsb')
    own_posts = []
    with open(raw_path, 'r') as raw_file, open(own_path, 'w') as own_file:
        for line in raw_file:
            if needle in line:
                own_file.write(line)
                own_posts.append(json.loads(line))
    return own_posts


def _report_post(steem, author, entry):
    """Scrape one post and print its link, title, votes, rewards, comments and body."""
    permlink = entry['comment']['permlink']
    category = entry['comment']['category']
    ###FULL URL
    my_url = 'https://steemit.com/' + category + '/@' + author + '/' + permlink

    ###SCRAPE THE URL LINK OF THE POST
    response = requests.get(my_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    try:
        html_url = response.text
    finally:
        response.close()
    #FORMAT AS SOUP FOR EASY PARSING
    soup_url = soup(html_url, 'lxml')

    #FULL URL LINK
    print('LINK:\n' + ' ' + my_url)

    #POST TITLE
    # find() returns None when the tag is absent; report that instead of
    # aborting the whole run with an AttributeError.
    title_tag = soup_url.find('h1', {'class': 'entry-title'})
    if title_tag is not None:
        print('TITLE:\n' + ' ' + title_tag.text)
    else:
        print(' Title not found')

    #PYTHON WAY WHO UPVOTED
    voters = steem.get_active_votes(author, permlink)
    upvoters = [str(vote['voter']) for vote in voters]
    print('UPVOTERS:\n' + ' ' + str(upvoters))

    #REWARDS IN USD
    menus = soup_url.findAll('div', {'class': 'DropdownMenu'})
    try:
        rewards = menus[0].a.span.span.text.replace(' ', '')
    except (IndexError, AttributeError):
        # No dropdown menu, or it lacks the nested a/span/span element.
        print(' REWARDS: $0.00')
    else:
        print('REWARDS:\n' + ' ' + rewards)

    #COMMENTS
    comment_tags = soup_url.findAll('div', {'class': 'Comment__body entry-content'})
    comments = [tag.text.strip() for tag in comment_tags]
    if comments:
        print('COMMENTS:\n' + ' ' + str(comments))
    else:
        print(' Comments not found')

    #BODY
    body_tag = soup_url.find('div', {'class': 'PostFull__body entry-content'})
    if body_tag is not None:
        print('BODY:\n' + ' ' + str(body_tag.text))
    else:
        print(' Body not found')

    #ENDING COMMANDS
    print('\n')


def main(argv):
    """Run the report for the author named in ``argv[1]``.

    Returns the process exit status: 0 on success, 2 when no author was given.
    """
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else 'steemblogs'
        print('usage: ' + prog + ' AUTHOR', file=sys.stderr)
        return 2
    author = argv[1]

    ###MAKE TEMP DIR
    shutil.rmtree(TEMPDIR, ignore_errors=True)
    os.mkdir(TEMPDIR)
    try:
        ###NODE SOURCE
        steem = Steem(nodes=[NODE_URL])
        for entry in _stage_own_posts(steem, author, TEMPDIR):
            _report_post(steem, author, entry)
    finally:
        ###CLEAN UP FOLDERS
        # Runs even when a request or a parse step fails, so the staging
        # directory is never left behind.
        shutil.rmtree(TEMPDIR, ignore_errors=True)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
I have already indicated the line numbers so that we can easily go over the routines in the next posts.
This will be another level of Python programming, one that can teach us many ways to gather complex data structures the Python way.
“There Are Many Ways To Skin A Cat… Python Is One.”