diff --git a/.gitignore b/.gitignore index db4561e..f26d885 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ docs/_build/ # PyBuilder target/ + +# Prevent accidental cookie leak +cookie diff --git a/README.rst b/README.rst index e4fb77f..9b5ad28 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,7 @@ zhihu-python:获取知乎信息 =============================== -:Author: `egrcc `_ ( `微博 `_ | `电邮 `_ ) -:Contributors: 参见 `Contributors `_ -:Update: 04/23 2016 - +**注意: 本项目不再维护更新!** .. contents:: @@ -875,11 +872,4 @@ zhihu.Post ---- 知乎专栏文章操作类 **Returns**:一个 Column 的实例对象 - -联系我 ----------- - -- 微博:http://weibo.com/u/2948739432 -- github:https://github.com/egrcc -- email:zhaolujun1994@gmail.com diff --git a/auth.py b/auth.py index adf6eaa..ed51290 100644 --- a/auth.py +++ b/auth.py @@ -4,6 +4,7 @@ # Build-in / Std import os, sys, time, platform, random import re, json, cookielib +from getpass import getpass # requirements import requests, termcolor @@ -67,7 +68,7 @@ def __init__(self, message): def download_captcha(): url = "https://www.zhihu.com/captcha.gif" - r = requests.get(url, params={"r": random.random(), "type": "login"} ) + r = requests.get(url, params={"r": random.random(), "type": "login"}, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") image_name = u"verify." + r.headers['content-type'].split("/")[1] @@ -82,15 +83,7 @@ def download_captcha(): elif platform.system() == "Darwin": Logging.info(u"Command: open %s &" % image_name ) os.system("open %s &" % image_name ) - elif platform.system() == "SunOS": - os.system("open %s &" % image_name ) - elif platform.system() == "FreeBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "Unix": - os.system("open %s &" % image_name ) - elif platform.system() == "OpenBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "NetBSD": + elif platform.system() in ("SunOS", "FreeBSD", "Unix", "OpenBSD", "NetBSD"): os.system("open %s &" % image_name ) elif platform.system() == "Windows": os.system("%s" % image_name ) @@ -103,7 +96,7 @@ def download_captcha(): def search_xsrf(): url = "http://www.zhihu.com/" - r = requests.get(url) + r = requests.get(url, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") results = re.compile(r"\ # 代表该用户提的所有问题的生成器对象 diff --git "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" "b/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" deleted file mode 100644 index b1e3921..0000000 --- "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" +++ /dev/null @@ -1,21 +0,0 @@ -现实可以有多美好? - -作者: 邓岂 赞同: 23 - -最近高考失利啦,考的很差很差。高三辛苦弄到的加分全都没用上。 -心里想不开决定自杀。 -于是给已经有男朋友的她,发了一条长长的短信,告诉她我要去复读,请她以后不要再联系我了,把我忘了吧。其实现在的我也不再单身,有了一个女朋友。可我在决定去死的时候,选择的唯一一个告别的人却依然是她。 -那晚,我爬到了我家阳台上(24楼),准备一下子跳下去。 -可是她回了我的短信,大意是她明白我想说不仅仅是去复读,而是要做危险的事。她还说,她不舍得忘掉我。 -那一晚心情沉痛,关了手机,从家里骑了2个小时的自行车,到她家的楼下,看着她家的灯火一夜未息。直到天蒙蒙亮,我方才骑车回家。回家打开手机,112个未接电话,其中有98个来自她,另外来自我和她共同的朋友们。 -她们说,半夜收到她给她们发的短信,告诉她们她很担心我会做出很多出格的事情。 -我很偏执,但我看到、知道这一切的时候,我的心肺都融化了。 - -她拒绝过我。甚至我还亲自见证了她是怎样一步一步的爱上那个男生,并最终和她走到了一起。 -有人说,我是这世界上最傻最傻的备胎。可是我想告诉所有这样说的人,做这样一个备胎可是生命中最美好的事儿啊。 - -以上。 - - - -原链接: http://www.zhihu.com/question/24269892/answer/27338490 \ No newline at end of file diff --git a/zhihu.py b/zhihu.py index 222eaea..68af702 100755 --- a/zhihu.py +++ b/zhihu.py @@ -106,7 +106,12 @@ def __init__(self, url): self.slug = re.compile(r"(http|https)://zhuanlan.zhihu.com/p/(\d{8})").match(url).group(2) def parser(self): - r = requests.get('https://zhuanlan.zhihu.com/api/posts/' + self.slug) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "zhuanlan.zhihu.com", + 'Accept': "application/json, text/plain, */*" + } + r = requests.get('https://zhuanlan.zhihu.com/api/posts/' + self.slug, headers=headers, verify=False) self.meta = r.json() def get_title(self): @@ -188,7 +193,12 @@ def __init__(self, url, slug=None): self.slug = slug def parser(self): - r = requests.get('https://zhuanlan.zhihu.com/api/columns/' + self.slug) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "zhuanlan.zhihu.com", + 'Accept': "application/json, text/plain, */*" + } + r = requests.get('https://zhuanlan.zhihu.com/api/columns/' + self.slug, headers=headers, verify=False) self.meta = r.json() def get_title(self): @@ -256,7 +266,14 @@ def get_all_posts(self): for i in xrange((posts_num - 1) / 20 + 1): parm = {'limit': 20, 'offset': 20*i} url = 'https://zhuanlan.zhihu.com/api/columns/' + self.slug + '/posts' - r = requests.get(url, params=parm) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(url, params=parm, headers=headers, verify=False) posts_list = r.json() for p in posts_list: post_url = 'https://zhuanlan.zhihu.com/p/' + str(p['slug']) @@ -276,7 +293,14 @@ def __init__(self, url, title=None): if title != None: self.title = title def parser(self): - r = requests.get(self.url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url,headers=headers, verify=False) self.soup = BeautifulSoup(r.content, "lxml") def get_title(self): @@ -417,7 +441,7 @@ def get_all_answers(self): 'Host': "www.zhihu.com", 'Referer': self.url } - r = requests.post(post_url, data=data, headers=header) + r = requests.post(post_url, data=data, headers=header, verify=False) answer_list = r.json()["msg"] for j in xrange(min(answers_num - i * 20, 20)): @@ -506,7 +530,14 @@ def __init__(self, user_url, user_id=None): self.user_id = user_id def parser(self): - r = requests.get(self.user_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.user_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") self.soup = soup @@ -623,6 +654,22 @@ def get_followers_num(self): .find_all("a")[1].strong.string) return followers_num + def get_topics_num(self): + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[-1].strong.string.encode("utf-8") + I='' + for i in topics_num: + if i.isdigit(): + I=I+i + topics_num=int(I) + return topics_num + def get_agree_num(self): if self.user_url == None: print "I'm anonymous user." @@ -690,7 +737,14 @@ def get_followees(self): yield else: followee_url = self.user_url + "/followees" - r = requests.get(followee_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(followee_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") for i in xrange((followees_num - 1) / 20 + 1): @@ -715,7 +769,7 @@ def get_followees(self): 'Referer': followee_url } - r_post = requests.post(post_url, data=data, headers=header) + r_post = requests.post(post_url, data=data, headers=header, verify=False) followee_list = r_post.json()["msg"] for j in xrange(min(followees_num - i * 20, 20)): @@ -735,7 +789,14 @@ def get_followers(self): yield else: follower_url = self.user_url + "/followers" - r = requests.get(follower_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(follower_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") for i in xrange((followers_num - 1) / 20 + 1): @@ -759,7 +820,7 @@ def get_followers(self): 'Host': "www.zhihu.com", 'Referer': follower_url } - r_post = requests.post(post_url, data=data, headers=header) + r_post = requests.post(post_url, data=data, headers=header, verify=False) follower_list = r_post.json()["msg"] for j in xrange(min(followers_num - i * 20, 20)): @@ -767,6 +828,55 @@ def get_followers(self): user_link = follower_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) + def get_topics(self): + if self.user_url == None: + print "I'm anonymous user." + return + yield + else: + topics_num = self.get_topics_num() + # print topics_num + if topics_num == 0: + return + yield + else: + topics_url = self.user_url + "/topics" + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(topics_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") + for i in xrange((topics_num - 1) / 20 + 1): + if i == 0: + topic_list = soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + else: + post_url = topics_url + _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] + offset = i * 20 + data = { + '_xsrf': _xsrf, + 'offset': offset, + 'start': 0 + } + header = { + 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", + 'Host': "www.zhihu.com", + 'Referer': topics_url + } + r_post = requests.post(post_url, data=data, headers=header, verify=False) + + topic_data = r_post.json()["msg"][1] + topic_soup = BeautifulSoup(topic_data, "lxml") + topic_list = topic_soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num - i * 20, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + def get_asks(self): """ By ecsys (https://github.com/ecsys) @@ -785,7 +895,14 @@ def get_asks(self): else: for i in xrange((asks_num - 1) / 20 + 1): ask_url = self.user_url + "/asks?page=" + str(i + 1) - r = requests.get(ask_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(ask_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") for question in soup.find_all("a", class_="question_link"): @@ -806,7 +923,14 @@ def get_answers(self): else: for i in xrange((answers_num - 1) / 20 + 1): answer_url = self.user_url + "/answers?page=" + str(i + 1) - r = requests.get(answer_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(answer_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") for answer in soup.find_all("a", class_="question_link"): question_url = "http://www.zhihu.com" + answer["href"][0:18] @@ -827,8 +951,14 @@ def get_collections(self): else: for i in xrange((collections_num - 1) / 20 + 1): collection_url = self.user_url + "/collections?page=" + str(i + 1) - - r = requests.get(collection_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(collection_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") for collection in soup.find_all("div", class_="zm-profile-section-item zg-clear"): @@ -845,7 +975,14 @@ def get_likes(self): return yield else: - r = requests.get(self.user_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.user_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") # Handle the first liked item first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'}) @@ -866,7 +1003,7 @@ def get_likes(self): 'Referer': self.user_url, 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", } - r = requests.post(post_url, data=data, headers=header) + r = requests.post(post_url, data=data, headers=header, verify=False) response_size = r.json()["msg"][0] response_html = r.json()["msg"][1] while response_size > 0: @@ -886,7 +1023,7 @@ def get_likes(self): 'start': latest_data_time, '_xsrf': _xsrf, } - r = requests.post(post_url, data=data, headers=header) + r = requests.post(post_url, data=data, headers=header, verify=False) response_size = r.json()["msg"][0] response_html = r.json()["msg"][1] return @@ -912,7 +1049,14 @@ def __init__(self, answer_url, question=None, author=None, upvote=None, content= self.content = content def parser(self): - r = requests.get(self.answer_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.answer_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") self.soup = soup @@ -1157,7 +1301,14 @@ def get_voters(self): # create_session() # s = session # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) - r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") voters_info = soup.find_all("span")[1:-1] if len(voters_info) == 0: @@ -1192,7 +1343,14 @@ def __init__(self, url, name=None, creator=None): if creator != None: self.creator = creator def parser(self): - r = requests.get(self.url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") self.soup = soup @@ -1257,7 +1415,14 @@ def get_all_answers(self): yield Answer(answer_url, question, author) i = 2 while True: - r = requests.get(self.url + "?page=" + str(i)) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url + "?page=" + str(i), headers=headers, verify=False) answer_soup = BeautifulSoup(r.content, "lxml") answer_list = answer_soup.find_all("div", class_="zm-item") if len(answer_list) == 0: