简书作者信息统计

从2016年就开始在简书上写文章,之所以选择这个平台,很大程度上是因为它简洁的后台输入界面,支持MD,非常适合写一些技术类的文章。

从最开始的零星几个粉丝,到粉丝数破百,到现在有700+粉丝,虽然离大牛们动辄数十万的粉丝还有很大的差距,但是看着粉丝数一天天上涨,感觉也很有成就感。于是就想着记录下粉丝数增长的过程,以后当粉丝数上万后可以回头看看自己是怎么一步一步走到今天的(此处应该是意淫的表情)。但是,也许是简书太追求简洁,后台连作者粉丝数的历史记录都不提供,不像微信公众号、头条、熊掌号这些成熟的自媒体平台,都会给作者提供完善的相关数据记录,让作者可以清晰地跟踪自己粉丝数增长的曲线。

既然简书不提供,自己又希望有这个功能,又不想放弃简书这个平台,那就自己动手,丰衣足食吧。实现起来也很简单,思路如下:

  1. 写个爬虫,每天定时爬取自己简书账号的各项指标数据(包括粉丝数、喜欢数、文章数等)
  2. 写一个页面,用来做数据展示 简书信息统计展示页面
  3. 没了,就是这么简单。

Talk is cheap. Show me the code.

上代码,这里附上爬虫的代码,展示页面做得还不是很完善,就不献丑了。

# -*- coding:utf-8 -*-
import time
import re
import random
import requests
from lxml import etree
from pymongo import MongoClient
# Module-level MongoDB client; with no arguments it connects to the
# default server at localhost:27017.
client = MongoClient()

def randomUserAgent():
    """Return a randomly chosen browser User-Agent string.

    Rotating the UA on each request makes the crawler look less like a
    single automated client.
    """
    # Pool of real-world browser UA strings (desktop and mobile).
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    ]
    return random.choice(USER_AGENTS)

def getResponse(url, **kwargs):
    """GET *url* with requests, injecting a random User-Agent header.

    Extra keyword arguments are forwarded to requests.get(). If the
    caller already supplied ``headers`` they are used untouched.
    Returns the requests.Response object.
    """
    if 'headers' not in kwargs:
        kwargs['headers'] = {
            'User-Agent': randomUserAgent(),
        }
    # Without a timeout requests.get can block forever on a stalled
    # connection; callers may still pass their own timeout to override.
    kwargs.setdefault('timeout', 10)
    return requests.get(url, **kwargs)

def getArticleInfo(user):
    """Crawl every article-list page of *user* and return per-article stats.

    user: dict holding at least 'uid' and 'article_nums' (int-convertible).
    Returns a list of dicts with keys link, title, read_num, comment_num,
    heart_num; the three counters are converted to int and rows whose
    counters do not parse are silently skipped.
    """
    print("==getArticleInfo==")
    uid = user['uid']
    article_num = int(user['article_nums'])
    PER_NUM = 9  # articles shown per listing page on jianshu.com
    # Ceiling division: number of listing pages needed to cover all articles.
    max_page = -(-article_num // PER_NUM)
    article_urls = [
        'https://www.jianshu.com/u/{}?order_by=shared_at&page={}'.format(uid, i)
        for i in range(1, max_page + 1)
    ]

    # XPath for each field, relative to one <li> item.
    # Constant for every item, so it is built once, outside the loops.
    details_xpath = {
        'link': './div/a/@href',
        'title': './div/a/text()',
        'read_num': './/div[@class="meta"]/a[1]/text()',
        'comment_num': './/div[@class="meta"]/a[2]/text()',
        'heart_num': './/div[@class="meta"]/span[1]/text()',
    }

    details = []
    for article_url in article_urls:
        r = getResponse(article_url)
        dom = etree.HTML(r.text)
        items = dom.xpath('//ul[@class="note-list"]/li')

        for item in items:
            # Extract every field from this <li>.
            detail = {key: ''.join(item.xpath(path)).strip()
                      for key, path in details_xpath.items()}
            try:
                # Counters arrive as text; keep only rows where all parse.
                for key in ['read_num', 'comment_num', 'heart_num']:
                    detail[key] = int(detail[key])
                details.append(detail)
            except ValueError:
                pass

    # Crawl result for all pages.
    return details

def getUserInfo(uid):
    """Fetch the jianshu profile page of *uid* and return its counters.

    Returns a dict with uid, following, follows (followers),
    article_nums, word_nums and like_nums, all as the raw strings
    scraped from the page. Raises IndexError if the page layout
    changed and the xpaths no longer match.
    """
    print('==getUserInfo==')
    url = 'https://www.jianshu.com/u/{}'.format(uid)
    r = getResponse(url)
    dom = etree.HTML(r.text)
    # Evaluate each xpath once instead of once per field.
    link_stats = dom.xpath('//div[@class="meta-block"]/a/p/text()')
    text_stats = dom.xpath('//div[@class="meta-block"]/p/text()')
    user_info = dict()
    user_info['uid'] = uid
    user_info['following'] = link_stats[0]
    user_info['follows'] = link_stats[1]
    user_info['article_nums'] = link_stats[2]
    user_info['word_nums'] = text_stats[0]
    user_info['like_nums'] = text_stats[1]
    return user_info

def getFollowsInfo(user_info):
    """Crawl the follower pages of *user_info* and return follower stats.

    user_info: dict holding at least 'uid' and 'follows' (int-convertible
    follower count). Returns a list of dicts with uid, following,
    follows, article_nums and, when present on the page, word_nums and
    like_nums (all raw strings). Malformed entries are skipped.
    """
    print("==getFollowsInfo==")
    follows = []
    uid = user_info['uid']
    follow_num = int(user_info['follows'])
    PER_NUM = 9  # followers shown per listing page
    # Ceiling division: number of follower pages to fetch.
    max_page = -(-follow_num // PER_NUM)
    following_urls = [
        'https://www.jianshu.com/users/{}/followers?page={}'.format(uid, i)
        for i in range(1, max_page + 1)
    ]
    for following_url in following_urls:
        r = getResponse(following_url)
        dom = etree.HTML(r.text)
        items = dom.xpath('//ul/li//div[@class="info"]')
        for item in items:
            user = {}
            try:
                # The profile link looks like /u/<uid>; segment 2 is the uid.
                user['uid'] = item.xpath('./a/@href')[0].split('/')[2]
                user['following'] = item.xpath('./div/span[1]/text()')[0].replace('关注', '').strip()
                user['follows'] = item.xpath('./div/span[2]/text()')[0].replace('粉丝', '').strip()
                user['article_nums'] = item.xpath('./div/span[3]/text()')[0].replace('文章', '').strip()
                s = item.xpath('./div[2]/text()')[0]
                num = re.findall(r"\d+", s)
                if len(num) == 2:
                    user['word_nums'] = num[0]
                    user['like_nums'] = num[1]
                follows.append(user)
            except (IndexError, ValueError):
                # BUGFIX: an xpath that matches nothing raises IndexError on
                # [0]; the original only caught ValueError, so one malformed
                # entry aborted the whole crawl. Skip such entries instead.
                pass
    return follows

if __name__ == "__main__":
    # Target author id: the trailing segment of the jianshu profile URL.
    uid = "67eb7ed414d3"
    # One snapshot per run, stamped with the local date.
    date = time.strftime("%Y-%m-%d", time.localtime())

    user_info = getUserInfo(uid)
    details = getArticleInfo(user_info)
    follows = getFollowsInfo(user_info)

    data = {
        'uid': uid,
        'date': date,
        'following': user_info['following'],
        'follows': user_info['follows'],
        'article_nums': user_info['article_nums'],
        'word_nums': user_info['word_nums'],
        'like_nums': user_info['like_nums'],
        'article_details': details,
        'follows_details': follows,
    }

    # Persist today's snapshot into MongoDB: database "jianshu",
    # collection "author_info".
    db = client.jianshu
    posts = db.author_info
    result = posts.insert_one(data)
    print(result)

数据库用的是 MongoDB,请自行配置。

祝各位作者粉丝数暴增。

获取更多最新资讯,免费获取百G视频教程

请关注微信公众号:南强说晚安