From 6893172b3cf5f5ec513899608852e063b1e27de1 Mon Sep 17 00:00:00 2001 From: yannisxu Date: Thu, 17 Sep 2015 09:27:58 +0800 Subject: [PATCH 01/48] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E7=9F=A5=E4=B9=8E=20data-id=20=E7=9A=84=E6=96=B9=E6=B3=95?= =?UTF-8?q?=E6=9D=A5=E7=A1=AE=E5=AE=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/zhihu.py b/zhihu.py index 02c137f..9c75c35 100755 --- a/zhihu.py +++ b/zhihu.py @@ -365,6 +365,17 @@ def get_user_id(self): else: return user_id + def get_data_id(self): + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + data_id = soup.find("button", class_="zg-btn zg-btn-follow zm-rich-follow-btn")['data-id'] + return data_id + def get_followees_num(self): if self.user_url == None: print "I'm anonymous user." From caa3aff7673319ac22704bc3fa0e5e84f9ecceba Mon Sep 17 00:00:00 2001 From: LuoZijun Date: Wed, 7 Oct 2015 01:47:45 +0800 Subject: [PATCH 02/48] Update auth.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 支持手机号码登录 --- auth.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/auth.py b/auth.py index a371196..76b82e5 100644 --- a/auth.py +++ b/auth.py @@ -112,8 +112,7 @@ def search_xsrf(): return results[0] def build_form(account, password): - account_type = "email" - if re.match(r"^\d{11}$", account): account_type = "phone" + if re.match(r"^1\d{10}$", account): account_type = "phone_num" elif re.match(r"^\S+\@\S+\.\S+$", account): account_type = "email" else: raise AccountError(u"帐号类型错误") @@ -124,7 +123,10 @@ def build_form(account, password): return form def upload_form(form): - url = "http://www.zhihu.com/login/email" + if "email" in form: url = "http://www.zhihu.com/login/email" + elif "phone_num" in form: url = "http://www.zhihu.com/login/phone_num" + else: raise ValueError(u"账号类型错误") + headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", 'Host': "www.zhihu.com", From 419335012cf4ae62976eba3a21ef0d90a8f588f0 Mon Sep 17 00:00:00 2001 From: Beiren Xie Date: Sat, 10 Oct 2015 17:27:48 +0800 Subject: [PATCH 03/48] fix bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 发现 get_content 失败,找了一下应该是空格的锅 --- zhihu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/zhihu.py b/zhihu.py index 02c137f..f5149da 100755 --- a/zhihu.py +++ b/zhihu.py @@ -189,7 +189,7 @@ def get_all_answers(self): my_answer_count += 1 is_my_answer = True - if soup.find_all("div", class_="zm-item-answer")[j].find("div", class_=" zm-editable-content clearfix") == None: + if soup.find_all("div", class_="zm-item-answer")[j].find("div", class_="zm-editable-content clearfix") == None: error_answer_count += 1 continue author = None @@ -215,7 +215,7 @@ def get_all_answers(self): answer_url = "http://www.zhihu.com" + soup.find_all("a", class_="answer-date-link")[j]["href"] - answer = soup.find_all("div", class_=" zm-editable-content clearfix")[j - error_answer_count] + answer = soup.find_all("div", class_="zm-editable-content clearfix")[j - error_answer_count] soup.body.extract() soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'})) soup.body.append(answer) @@ -255,7 +255,7 @@ def get_all_answers(self): answer_soup = BeautifulSoup(answer_list[j]) - if answer_soup.find("div", class_=" zm-editable-content clearfix") == None: + if answer_soup.find("div", class_="zm-editable-content clearfix") == None: continue author = None @@ -281,7 +281,7 @@ def get_all_answers(self): answer_url = "http://www.zhihu.com" + answer_soup.find("a", class_="answer-date-link")["href"] - answer = answer_soup.find("div", class_=" zm-editable-content clearfix") + answer = answer_soup.find("div", class_="zm-editable-content clearfix") soup.body.extract() soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'})) soup.body.append(answer) @@ -674,7 +674,7 @@ def get_content(self): if self.soup == None: self.parser() soup = BeautifulSoup(self.soup.encode("utf-8")) - answer = soup.find("div", class_=" zm-editable-content clearfix") + answer = soup.find("div", class_="zm-editable-content clearfix") soup.body.extract() soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'})) soup.body.append(answer) From 935c6c57eb37cd241e4e267c42f64b56dc98c18a Mon Sep 17 00:00:00 2001 From: Ecsys Date: Wed, 14 Oct 2015 20:57:22 +0800 Subject: [PATCH 04/48] Added get_likes() function in User --- zhihu.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/zhihu.py b/zhihu.py index 02c137f..cab1720 100755 --- a/zhihu.py +++ b/zhihu.py @@ -599,6 +599,63 @@ def get_collections(self): yield Collection(url, name, self) + def get_likes(self): + # Todo: first version without zhuanlan article, also need the first one + if self.user_url == None: + print "I'm an anonymous user." + return + yield + else: + r = requests.get(self.user_url) + soup = BeautifulSoup(r.content) + # Handle the first liked item + first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'}) + first_item = first_item.find("div", attrs={'class':'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'}) + if u"赞同了回答" in str(first_item): + first_like = first_item.find("a")['href'] + yield Answer("http://www.zhihu.com" + first_like) + # Handle the rest liked items + post_url = self.user_url + "/activities" + start_time = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'})["data-time"] + _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] + data = { + 'start': start_time, + '_xsrf': _xsrf, + } + header = { + 'Host': "www.zhihu.com", + 'Referer': self.user_url, + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", + } + r = requests.post(post_url, data=data, headers=header) + response_size = r.json()["msg"][0] + response_html = r.json()["msg"][1] + while response_size > 0: + all_liked_answers = re.findall(u"\u8d5e\u540c\u4e86\u56de\u7b54\n\n Date: Wed, 14 Oct 2015 21:05:41 +0800 Subject: [PATCH 05/48] Added get_likes() function in User --- zhihu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zhihu.py b/zhihu.py index cab1720..04cbb7e 100755 --- a/zhihu.py +++ b/zhihu.py @@ -600,7 +600,7 @@ def get_collections(self): def get_likes(self): - # Todo: first version without zhuanlan article, also need the first one + # This function only handles liked answers, not including zhuanlan articles if self.user_url == None: print "I'm an anonymous user." return @@ -632,7 +632,6 @@ def get_likes(self): response_html = r.json()["msg"][1] while response_size > 0: all_liked_answers = re.findall(u"\u8d5e\u540c\u4e86\u56de\u7b54\n\n Date: Sun, 1 Nov 2015 19:34:05 +0800 Subject: [PATCH 06/48] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20=E9=97=AE=E9=A2=98?= =?UTF-8?q?=20#30?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修正问题 https://github.com/egrcc/zhihu-python/issues/30 中出现的 JSON 解析失败的问题。 --- auth.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/auth.py b/auth.py index a371196..b1e7c79 100644 --- a/auth.py +++ b/auth.py @@ -139,7 +139,14 @@ def upload_form(form): raise NetworkError(u"表单上传失败!") if r.headers['content-type'].lower() == "application/json": - result = r.json() + try: + # 修正 justkg 提出的问题: https://github.com/egrcc/zhihu-python/issues/30 + result = json.loads(r.content) + except Exception as e: + Logging.error(u"JSON解析失败!") + Logging.debug(e) + Logging.debug(r.content) + result = {} if result["r"] == 0: Logging.success(u"登录成功!" ) return {"result": True} From 2e6604794f2de0c1c40349cc8cff9206a235c810 Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 16 Dec 2015 20:28:36 +0800 Subject: [PATCH 07/48] modify according to issue #34 --- auth.py | 12 +++--- zhihu.py | 118 +++++++++++++++++++++++++++---------------------------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/auth.py b/auth.py index 0828d41..8318353 100644 --- a/auth.py +++ b/auth.py @@ -135,7 +135,7 @@ def upload_form(form): 'Referer': "http://www.zhihu.com/", 'X-Requested-With': "XMLHttpRequest" } - + r = requests.post(url, data=form, headers=headers) if int(r.status_code) != 200: raise NetworkError(u"表单上传失败!") @@ -165,7 +165,7 @@ def upload_form(form): def islogin(): # check session - url = "http://www.zhihu.com/settings/profile" + url = "https://www.zhihu.com/settings/profile" r = requests.get(url, allow_redirects=False) status_code = int(r.status_code) if status_code == 301 or status_code == 302: @@ -179,7 +179,7 @@ def islogin(): def read_account_from_config_file(config_file="config.ini"): - # NOTE: The ConfigParser module has been renamed to configparser in Python 3. + # NOTE: The ConfigParser module has been renamed to configparser in Python 3. # The 2to3 tool will automatically adapt imports when converting your sources to Python 3. # https://docs.python.org/2/library/configparser.html from ConfigParser import ConfigParser @@ -198,8 +198,8 @@ def read_account_from_config_file(config_file="config.ini"): Logging.error(u"配置文件加载失败!") return (None, None) - - + + def login(account=None, password=None): if islogin() == True: @@ -215,7 +215,7 @@ def login(account=None, password=None): form_data = build_form(account, password) """ - result: + result: {"result": True} {"error": {"code": 19855555, "message": "unknow.", "data": "data" } } {"error": {"code": -1, "message": u"unknow error"} } diff --git a/zhihu.py b/zhihu.py index f5149da..f72be5b 100755 --- a/zhihu.py +++ b/zhihu.py @@ -1,52 +1,52 @@ # -*- coding: utf-8 -*- ''' - ;$$; - ############# - #############;#####o - ## o######################### + ;$$; + ############# + #############;#####o + ## o######################### ##### $############################### ## ###$ ######! ########################## ## ### $### ################### ###### ### ### ##o####################### ###### ;### #### ##################### - ## ### ###### ######&&################ - ## ### ###### ## ############ ####### - o## ######## ## ################## - ##o ### #### #######o####### - ## ###### ###########&##### - ## #### #############! - ### ######### - #####& ## o#### - ###### ## ####* - ## !## ##### - ## ##* ####; ## - ##### #####o ##### - #### ### ### $###o - ### ## ####! $### - ## ##### - ## ## - ;## ### ; - ##$ ## - ####### ## - ##### &## ## - ### ### ### - ### ### ## - ## ;## ## - ## ### ## - ### ### ## - #### ## - ### ## - ##; ## - ##$ ##& - ## ## - ##; ## - ## ##; - ### ### ##$ - ### ### ## - ###################### #####&&&&&&&&&&&&### - ### $#####$ ############&$o$&################################ - # $&########&o + ## ### ###### ######&&################ + ## ### ###### ## ############ ####### + o## ######## ## ################## + ##o ### #### #######o####### + ## ###### ###########&##### + ## #### #############! + ### ######### + #####& ## o#### + ###### ## ####* + ## !## ##### + ## ##* ####; ## + ##### #####o ##### + #### ### ### $###o + ### ## ####! $### + ## ##### + ## ## + ;## ### ; + ##$ ## + ####### ## + ##### &## ## + ### ### ### + ### ### ## + ## ;## ## + ## ### ## + ### ### ## + #### ## + ### ## + ##; ## + ##$ ##& + ## ## + ##; ## + ## ##; + ### ### ##$ + ### ### ## + ###################### #####&&&&&&&&&&&&### + ### $#####$ ############&$o$&################################ + # $&########&o ''' # Build-in / Std @@ -102,7 +102,7 @@ def __init__(self, url, title=None): raise ValueError("\"" + url + "\"" + " : it isn't a question url.") else: self.url = url - + if title != None: self.title = title def parser(self): @@ -188,16 +188,16 @@ def get_all_answers(self): if soup.find_all("div", class_="zm-item-answer")[j].find("span", class_="count") == None: my_answer_count += 1 is_my_answer = True - + if soup.find_all("div", class_="zm-item-answer")[j].find("div", class_="zm-editable-content clearfix") == None: error_answer_count += 1 continue author = None - if soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].string == u"匿名用户": + if soup.find_all("div", class_="zm-item-answer-author-info")[j].string == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = soup.find_all("h3", class_="zm-item-answer-author-wrap")[j].find_all("a")[1] + author_tag = soup.find_all("div", class_="zm-item-answer-author-info")[j].find_all("a")[1] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -254,16 +254,16 @@ def get_all_answers(self): soup = BeautifulSoup(self.soup.encode("utf-8")) answer_soup = BeautifulSoup(answer_list[j]) - + if answer_soup.find("div", class_="zm-editable-content clearfix") == None: continue - + author = None - if answer_soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户": + if answer_soup.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = answer_soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1] + author_tag = answer_soup.find("div", class_="zm-item-answer-author-info").find_all("a")[1] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -328,7 +328,7 @@ class User: def __init__(self, user_url, user_id=None): if user_url == None: self.user_id = "匿名用户" - elif user_url[0:28] != "http://www.zhihu.com/people/": + elif user_url.startswith('www.zhihu.com/people', user_url.index('//') + 2) == False: raise ValueError("\"" + user_url + "\"" + " : it isn't a user url.") else: self.user_url = user_url @@ -547,7 +547,7 @@ def get_asks(self): for i in xrange((asks_num - 1) / 20 + 1): ask_url = self.user_url + "/asks?page=" + str(i + 1) r = requests.get(ask_url) - + soup = BeautifulSoup(r.content) for question in soup.find_all("a", class_="question_link"): url = "http://www.zhihu.com" + question["href"] @@ -588,7 +588,7 @@ def get_collections(self): else: for i in xrange((collections_num - 1) / 20 + 1): collection_url = self.user_url + "/collections?page=" + str(i + 1) - + r = requests.get(collection_url) soup = BeautifulSoup(r.content) @@ -641,11 +641,11 @@ def get_author(self): if self.soup == None: self.parser() soup = self.soup - if soup.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户": + if soup.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = soup.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[1] + author_tag = soup.find("div", class_="zm-item-answer-author-info").find_all("a")[1] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -945,12 +945,12 @@ def get_all_answers(self): question = Question(question_url, question_title) answer_url = "http://www.zhihu.com" + answer.find("span", class_="answer-date-link-wrap").a["href"] author = None - - if answer.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户": + + if answer.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": author_url = None author = User(author_url) else: - author_tag = answer.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[0] + author_tag = answer.find("div", class_="zm-item-answer-author-info").find_all("a")[0] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) @@ -973,12 +973,12 @@ def get_all_answers(self): answer_url = "http://www.zhihu.com" + answer.find("span", class_="answer-date-link-wrap").a[ "href"] author = None - if answer.find("h3", class_="zm-item-answer-author-wrap").string == u"匿名用户": + if answer.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": # author_id = "匿名用户" author_url = None author = User(author_url) else: - author_tag = answer.find("h3", class_="zm-item-answer-author-wrap").find_all("a")[0] + author_tag = answer.find("div", class_="zm-item-answer-author-info").find_all("a")[0] author_id = author_tag.string.encode("utf-8") author_url = "http://www.zhihu.com" + author_tag["href"] author = User(author_url, author_id) From f6a3dc309b6b0fbf3381152429d55a4592b46dcb Mon Sep 17 00:00:00 2001 From: LuoZijun Date: Sat, 19 Dec 2015 11:28:00 +0800 Subject: [PATCH 08/48] Update auth.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修正不同平台编码问题 #30 --- auth.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/auth.py b/auth.py index 8318353..5e4fafd 100644 --- a/auth.py +++ b/auth.py @@ -96,8 +96,9 @@ def download_captcha(): os.system("open %s &" % image_name ) else: Logging.info(u"我们无法探测你的作业系统,请自行打开验证码 %s 文件,并输入验证码。" % os.path.join(os.getcwd(), image_name) ) - - captcha_code = raw_input( termcolor.colored("请输入验证码: ", "cyan") ) + + sys.stdout.write(termcolor.colored(u"请输入验证码: ", "cyan") ) + captcha_code = raw_input( ) return captcha_code def search_xsrf(): @@ -209,9 +210,10 @@ def login(account=None, password=None): if account == None: (account, password) = read_account_from_config_file() if account == None: - account = raw_input("请输入登录帐号: ") - password = raw_input("请输入登录密码: ") - + sys.stdout.write(u"请输入登录账号: ") + account = raw_input() + sys.stdout.write(u"请输入登录密码: ") + password = raw_input() form_data = build_form(account, password) """ From 08c64cda22886853f7d4a7dbfb5be6dded3f19f5 Mon Sep 17 00:00:00 2001 From: egrcc Date: Sat, 19 Dec 2015 12:33:27 +0800 Subject: [PATCH 09/48] add some annotations --- zhihu.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/zhihu.py b/zhihu.py index a961e0c..766f0f0 100755 --- a/zhihu.py +++ b/zhihu.py @@ -366,6 +366,11 @@ def get_user_id(self): return user_id def get_data_id(self): + """ + By yannisxu (https://github.com/yannisxu) + 增加获取知乎 data-id 的方法来确定标识用户的唯一性 #24 + (https://github.com/egrcc/zhihu-python/pull/24) + """ if self.user_url == None: print "I'm anonymous user." return 0 @@ -545,6 +550,11 @@ def get_followers(self): yield User(user_link["href"], user_link.string.encode("utf-8")) def get_asks(self): + """ + By ecsys (https://github.com/ecsys) + 增加了获取某用户所有赞过答案的功能 #29 + (https://github.com/egrcc/zhihu-python/pull/29) + """ if self.user_url == None: print "I'm anonymous user." return @@ -611,7 +621,7 @@ def get_collections(self): def get_likes(self): - # This function only handles liked answers, not including zhuanlan articles + # This function only handles liked answers, not including zhuanlan articles if self.user_url == None: print "I'm an anonymous user." return @@ -619,7 +629,7 @@ def get_likes(self): else: r = requests.get(self.user_url) soup = BeautifulSoup(r.content) - # Handle the first liked item + # Handle the first liked item first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'}) first_item = first_item.find("div", attrs={'class':'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'}) if u"赞同了回答" in str(first_item): @@ -635,7 +645,7 @@ def get_likes(self): } header = { 'Host': "www.zhihu.com", - 'Referer': self.user_url, + 'Referer': self.user_url, 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", } r = requests.post(post_url, data=data, headers=header) @@ -652,7 +662,7 @@ def get_likes(self): if len(data_times) != response_size: print "读取activities栏时间信息时发生错误,可能因为某答案中包含data-time信息" return - yield + yield latest_data_time = re.search(r"\d+", data_times[response_size - 1]).group() data = { 'start': latest_data_time, @@ -663,8 +673,8 @@ def get_likes(self): response_html = r.json()["msg"][1] return yield - - + + class Answer: answer_url = None From 7b8e42901b00bad26e2a46ac1817f313e94c62e6 Mon Sep 17 00:00:00 2001 From: frostming Date: Thu, 24 Dec 2015 15:00:31 +0800 Subject: [PATCH 10/48] =?UTF-8?q?=C3=A7=C2=BE=C2=8E=C3=A5=C2=8C=C2=96?= =?UTF-8?q?=C3=A4=C2=BA=C2=86markdown=C3=A6=C2=A0=C2=BC=C3=A5=C2=BC=C2=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/zhihu.py b/zhihu.py index 766f0f0..fef31ac 100755 --- a/zhihu.py +++ b/zhihu.py @@ -98,7 +98,7 @@ class Question: def __init__(self, url, title=None): - if url[0:len(url) - 8] != "http://www.zhihu.com/question/": + if not re.compile(r"(http|https)://www.zhihu.com/question/\d{8}").match(url): raise ValueError("\"" + url + "\"" + " : it isn't a question url.") else: self.url = url @@ -888,10 +888,10 @@ def to_md(self): f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "wt") f.write("# " + self.get_question().get_title() + "\n") if platform.system() == 'Windows': - f.write("## 作者: ".decode('utf-8').encode('gbk') + self.get_author().get_user_id() + " 赞同: ".decode( + f.write("### 作者: ".decode('utf-8').encode('gbk') + self.get_author().get_user_id() + " 赞同: ".decode( 'utf-8').encode('gbk') + str(self.get_upvote()) + "\n") else: - f.write("## 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n") + f.write("### 作者: " + self.get_author().get_user_id() + " 赞同: " + str(self.get_upvote()) + "\n") text = html2text.html2text(content.decode('utf-8')).encode("utf-8") r = re.findall(r'\*\*(.*?)\*\*', text) @@ -910,11 +910,11 @@ def to_md(self): if platform.system() == 'Windows': f.write(text.decode('utf-8').encode('gbk')) - link_str = "#### 原链接: ".decode('utf-8').encode('gbk') + link_str = "\n---\n#### 原链接: ".decode('utf-8').encode('gbk') f.write(link_str + self.answer_url.decode('utf-8').encode('gbk')) else: f.write(text) - f.write("#### 原链接: " + self.answer_url) + f.write("\n---\n#### 原链接: " + self.answer_url) f.close() def get_visit_times(self): From 2f704fd0dd68860d98595557f8680ac7432b1f88 Mon Sep 17 00:00:00 2001 From: egrcc Date: Fri, 25 Dec 2015 00:41:16 +0800 Subject: [PATCH 11/48] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20issue=20#36=20?= =?UTF-8?q?=E6=8F=90=E5=88=B0=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- auth.py | 4 ++-- zhihu.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/auth.py b/auth.py index 5e4fafd..d58edb0 100644 --- a/auth.py +++ b/auth.py @@ -93,10 +93,10 @@ def download_captcha(): elif platform.system() == "NetBSD": os.system("open %s &" % image_name ) elif platform.system() == "Windows": - os.system("open %s &" % image_name ) + os.system("%s" % image_name ) else: Logging.info(u"我们无法探测你的作业系统,请自行打开验证码 %s 文件,并输入验证码。" % os.path.join(os.getcwd(), image_name) ) - + sys.stdout.write(termcolor.colored(u"请输入验证码: ", "cyan") ) captcha_code = raw_input( ) return captcha_code diff --git a/zhihu.py b/zhihu.py index fef31ac..aed10a9 100755 --- a/zhihu.py +++ b/zhihu.py @@ -193,7 +193,7 @@ def get_all_answers(self): error_answer_count += 1 continue author = None - if soup.find_all("div", class_="zm-item-answer-author-info")[j].string == u"匿名用户": + if soup.find_all("div", class_="zm-item-answer-author-info")[j].get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: @@ -259,7 +259,7 @@ def get_all_answers(self): continue author = None - if answer_soup.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": + if answer_soup.find("div", class_="zm-item-answer-author-info").get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: @@ -552,7 +552,7 @@ def get_followers(self): def get_asks(self): """ By ecsys (https://github.com/ecsys) - 增加了获取某用户所有赞过答案的功能 #29 + 增加了获取某用户所有赞过答案的功能 #29 (https://github.com/egrcc/zhihu-python/pull/29) """ if self.user_url == None: @@ -718,7 +718,7 @@ def get_author(self): if self.soup == None: self.parser() soup = self.soup - if soup.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": + if soup.find("div", class_="zm-item-answer-author-info").get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: @@ -959,7 +959,8 @@ class Collection: def __init__(self, url, name=None, creator=None): - if url[0:len(url) - 8] != "http://www.zhihu.com/collection/": + #if url[0:len(url) - 8] != "http://www.zhihu.com/collection/": + if not re.compile(r"(http|https)://www.zhihu.com/collection/\d{8}").match(url): raise ValueError("\"" + url + "\"" + " : it isn't a collection url.") else: self.url = url @@ -1023,7 +1024,7 @@ def get_all_answers(self): answer_url = "http://www.zhihu.com" + answer.find("span", class_="answer-date-link-wrap").a["href"] author = None - if answer.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": + if answer.find("div", class_="zm-item-answer-author-info").get_text(strip='\n') == u"匿名用户": author_url = None author = User(author_url) else: @@ -1050,7 +1051,7 @@ def get_all_answers(self): answer_url = "http://www.zhihu.com" + answer.find("span", class_="answer-date-link-wrap").a[ "href"] author = None - if answer.find("div", class_="zm-item-answer-author-info").string == u"匿名用户": + if answer.find("div", class_="zm-item-answer-author-info").get_text(strip='\n') == u"匿名用户": # author_id = "匿名用户" author_url = None author = User(author_url) From 732b0471d55ef5db2bcab0731461412aa11313c2 Mon Sep 17 00:00:00 2001 From: Mukosame Date: Sat, 2 Jan 2016 16:27:05 +0800 Subject: [PATCH 12/48] add user::get_gender --- zhihu.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/zhihu.py b/zhihu.py index aed10a9..a103a88 100755 --- a/zhihu.py +++ b/zhihu.py @@ -381,6 +381,25 @@ def get_data_id(self): data_id = soup.find("button", class_="zg-btn zg-btn-follow zm-rich-follow-btn")['data-id'] return data_id + def get_gender(self): + """ + By Mukosame (https://github.com/mukosame) + 增加获取知乎识用户的性别 + + """ + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + gender = str(soup.find("span",class_="item gender").i) + if (gender == ''): + return 'female' + else: + return 'male' + def get_followees_num(self): if self.user_url == None: print "I'm anonymous user." From c1239fcda305ceea6e946b00a63a95035ea79d17 Mon Sep 17 00:00:00 2001 From: Mukosame Date: Sat, 2 Jan 2016 17:20:46 +0800 Subject: [PATCH 13/48] add get_gender to readme and test --- README.rst | 8 ++++++++ test.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/README.rst b/README.rst index 9040ba1..69ff6f7 100644 --- a/README.rst +++ b/README.rst @@ -255,6 +255,8 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 user = User(user_url) # 获取用户ID user_id = user.get_user_id() + # 获取用户性别 + user_gender = user.get_gender() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 @@ -485,7 +487,13 @@ zhihu.User ---- 知乎用户操作类 得到该用户的ID。 **Returns**: 代表 ID 的字符串 + + **user.get_gender** () + 得到该用户的性别。 + + **Returns**: 代表 性别 的字符串(male/female) + **get_followees_num** () 得到该用户关注的人的个数。 diff --git a/test.py b/test.py index 2235fb0..13f2c0b 100755 --- a/test.py +++ b/test.py @@ -103,6 +103,8 @@ def user_test(user_url): user = User(user_url) # 获取用户ID user_id = user.get_user_id() + # 获取用户性别 + user_gender = user.get_gender() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 @@ -130,6 +132,7 @@ def user_test(user_url): collections = user.get_collections() print user_id # 黄继新 + print user_gender #male print followers_num # 614840 print followees_num # 8408 print asks_num # 1323 From b1a91e91df7ac07468bff40b34c9eb7ca6ade38f Mon Sep 17 00:00:00 2001 From: egrcc Date: Sat, 2 Jan 2016 19:01:21 +0800 Subject: [PATCH 14/48] modify readme --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 69ff6f7..c2d0f0a 100644 --- a/README.rst +++ b/README.rst @@ -488,7 +488,7 @@ zhihu.User ---- 知乎用户操作类 **Returns**: 代表 ID 的字符串 - **user.get_gender** () + **get_gender** () 得到该用户的性别。 From 20932df4b6c1090b8fafc85debdb9bf5ff815bfc Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 5 Jan 2016 12:55:39 +0800 Subject: [PATCH 15/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=80=A7=E5=88=AB?= =?UTF-8?q?=E4=B8=8D=E5=8F=AF=E7=9F=A5=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/zhihu.py b/zhihu.py index a103a88..66628d3 100755 --- a/zhihu.py +++ b/zhihu.py @@ -385,20 +385,23 @@ def get_gender(self): """ By Mukosame (https://github.com/mukosame) 增加获取知乎识用户的性别 - - """ + + """ if self.user_url == None: print "I'm anonymous user." - return 0 + return 'unknown' else: if self.soup == None: self.parser() soup = self.soup - gender = str(soup.find("span",class_="item gender").i) - if (gender == ''): - return 'female' - else: - return 'male' + try: + gender = str(soup.find("span",class_="item gender").i) + if (gender == ''): + return 'female' + else: + return 'male' + except: + return 'unknown' def get_followees_num(self): if self.user_url == None: From 7f421aac09123365719e627792a8b572be1a6658 Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 20 Jan 2016 22:03:50 +0800 Subject: [PATCH 16/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20issue=20#39=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zhihu.py b/zhihu.py index 66628d3..8d6a8b4 100755 --- a/zhihu.py +++ b/zhihu.py @@ -951,7 +951,7 @@ def get_voters(self): if self.soup == None: self.parser() soup = self.soup - data_aid = soup.find("div", class_="zm-item-answer ")["data-aid"] + data_aid = soup.find("div", class_="zm-item-answer zm-item-expanded")["data-aid"] request_url = 'http://www.zhihu.com/node/AnswerFullVoteInfoV2' # if session == None: # create_session() From 13b07d3af9d7d59e473a3308504cc9eecaf5f3ed Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 20 Jan 2016 22:38:06 +0800 Subject: [PATCH 17/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20issue=20#39=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20bug2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zhihu.py b/zhihu.py index 8d6a8b4..072dc08 100755 --- a/zhihu.py +++ b/zhihu.py @@ -965,7 +965,7 @@ def get_voters(self): yield else: for voter_info in voters_info: - if voter_info.string == ( u"匿名用户、" or u"匿名用户"): + if voter_info.string == u"匿名用户、" or voter_info.string == u"匿名用户": voter_url = None yield User(voter_url) else: From 16b229c3f3a19cece94fba3c4c57fe407b34fd12 Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 20 Jan 2016 22:59:20 +0800 Subject: [PATCH 18/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20issue=20#41=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zhihu.py b/zhihu.py index 072dc08..1e9746a 100755 --- a/zhihu.py +++ b/zhihu.py @@ -819,6 +819,7 @@ def to_txt(self): # print file_name # else: # print file_name + file_name = file_name.replace("/", "'SLASH'") if os.path.exists(os.path.join(os.path.join(os.getcwd(), "text"), file_name)): f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "a") f.write("\n\n") @@ -839,6 +840,7 @@ def to_txt(self): # print file_name # else: # print file_name + file_name = file_name.replace("/", "'SLASH'") f = open(os.path.join(os.path.join(os.getcwd(), "text"), file_name), "wt") f.write(self.get_question().get_title() + "\n\n") if platform.system() == 'Windows': @@ -884,6 +886,7 @@ def to_md(self): # print file_name # else: # print file_name + file_name = file_name.replace("/", "'SLASH'") if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "markdown"))): os.makedirs(os.path.join(os.path.join(os.getcwd(), "markdown"))) if os.path.exists(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name)): @@ -907,6 +910,7 @@ def to_md(self): # print file_name # else: # print file_name + file_name = file_name.replace("/", "'SLASH'") f = open(os.path.join(os.path.join(os.getcwd(), "markdown"), file_name), "wt") f.write("# " + self.get_question().get_title() + "\n") if platform.system() == 'Windows': From a98c731cdc26a380c23ccf995147f9e4a4c8b375 Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 24 Feb 2016 17:23:23 +0800 Subject: [PATCH 19/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20issue=20#44=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zhihu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/zhihu.py b/zhihu.py index 1e9746a..600f04d 100755 --- a/zhihu.py +++ b/zhihu.py @@ -177,9 +177,9 @@ def get_all_answers(self): else: error_answer_count = 0 my_answer_count = 0 - for i in xrange((answers_num - 1) / 50 + 1): + for i in xrange((answers_num - 1) / 20 + 1): if i == 0: - for j in xrange(min(answers_num, 50)): + for j in xrange(min(answers_num, 20)): if self.soup == None: self.parser() soup = BeautifulSoup(self.soup.encode("utf-8")) @@ -234,9 +234,9 @@ def get_all_answers(self): else: post_url = "http://www.zhihu.com/node/QuestionAnswerListV2" _xsrf = self.soup.find("input", attrs={'name': '_xsrf'})["value"] - offset = i * 50 + offset = i * 20 params = json.dumps( - {"url_token": int(self.url[-8:-1] + self.url[-1]), "pagesize": 50, "offset": offset}) + {"url_token": int(self.url[-8:-1] + self.url[-1]), "pagesize": 20, "offset": offset}) data = { '_xsrf': _xsrf, 'method': "next", @@ -250,7 +250,7 @@ def get_all_answers(self): r = requests.post(post_url, data=data, headers=header) answer_list = r.json()["msg"] - for j in xrange(min(answers_num - i * 50, 50)): + for j in xrange(min(answers_num - i * 20, 20)): soup = BeautifulSoup(self.soup.encode("utf-8")) answer_soup = BeautifulSoup(answer_list[j]) From be26d760abf08e883ae5f1ba2a0a4e97f8a8500b Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:30:59 +0800 Subject: [PATCH 20/48] ali --- README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.rst b/README.rst index c2d0f0a..163148d 100644 --- a/README.rst +++ b/README.rst @@ -676,3 +676,11 @@ zhihu.Collection ---- 知乎收藏夹操作类 - 微博:http://weibo.com/u/2948739432 - github:https://github.com/egrcc - email:zhaolujun1994@gmail.com + + +捐赠 +---------- + +如果本项目有帮到你,欢迎捐赠支持: + +.. image:: http://egrcc.github.io/img/alipay.jpg From 72cfe10f3d0553a4cd1b45713c8cbcda522508b8 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:32:35 +0800 Subject: [PATCH 21/48] ali --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 163148d..509dc31 100644 --- a/README.rst +++ b/README.rst @@ -684,3 +684,4 @@ zhihu.Collection ---- 知乎收藏夹操作类 如果本项目有帮到你,欢迎捐赠支持: .. image:: http://egrcc.github.io/img/alipay.jpg + :scale: 50 % From f68b7c6d2ff5efe82532be99b1fcd3b20d577719 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:34:47 +0800 Subject: [PATCH 22/48] ali --- README.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 509dc31..932b4cd 100644 --- a/README.rst +++ b/README.rst @@ -683,5 +683,7 @@ zhihu.Collection ---- 知乎收藏夹操作类 如果本项目有帮到你,欢迎捐赠支持: -.. image:: http://egrcc.github.io/img/alipay.jpg - :scale: 50 % +
+ +
+
From 6d0517e539e490b146773d1fafd3ea740339e899 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:36:06 +0800 Subject: [PATCH 23/48] ali --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 932b4cd..f39feaa 100644 --- a/README.rst +++ b/README.rst @@ -683,7 +683,7 @@ zhihu.Collection ---- 知乎收藏夹操作类 如果本项目有帮到你,欢迎捐赠支持: -
- -
-
+.. image:: http://egrcc.github.io/img/alipay.jpg + :height: 350 px + :width: 350 px + From fc25aa098096dc7c351c943af87a597acdbbd1fb Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:44:30 +0800 Subject: [PATCH 24/48] ali --- README.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index f39feaa..cffcb1f 100644 --- a/README.rst +++ b/README.rst @@ -683,7 +683,12 @@ zhihu.Collection ---- 知乎收藏夹操作类 如果本项目有帮到你,欢迎捐赠支持: + .. image:: http://egrcc.github.io/img/alipay.jpg - :height: 350 px - :width: 350 px + :height: 300px + :width: 300 px + :scale: 50 % + :alt: alternate text + :align: right + From 3c5b15090954b4b9ac378de51e1e6c2201801b03 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 14:59:40 +0800 Subject: [PATCH 25/48] ali --- README.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index cffcb1f..fd41e0b 100644 --- a/README.rst +++ b/README.rst @@ -685,10 +685,8 @@ zhihu.Collection ---- 知乎收藏夹操作类 .. image:: http://egrcc.github.io/img/alipay.jpg - :height: 300px - :width: 300 px - :scale: 50 % - :alt: alternate text - :align: right + +.. image:: http://egrcc.github.io/img/wechat.jpg + From 10c46869d69d453c2ba0eb3dde6967da04a25571 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 15:00:36 +0800 Subject: [PATCH 26/48] ali --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index fd41e0b..ec3a8d1 100644 --- a/README.rst +++ b/README.rst @@ -686,7 +686,7 @@ zhihu.Collection ---- 知乎收藏夹操作类 .. image:: http://egrcc.github.io/img/alipay.jpg -.. image:: http://egrcc.github.io/img/wechat.jpg +.. image:: http://egrcc.github.io/img/wechat.png From 99d62231cc84b537f3391b9e56062ebcf21ba4f3 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 22 Mar 2016 15:02:54 +0800 Subject: [PATCH 27/48] ali --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index ec3a8d1..df51462 100644 --- a/README.rst +++ b/README.rst @@ -681,7 +681,9 @@ zhihu.Collection ---- 知乎收藏夹操作类 捐赠 ---------- -如果本项目有帮到你,欢迎捐赠支持: +如果本项目有帮到你,欢迎捐赠支持。 + +支付宝(左),微信支付(右): .. image:: http://egrcc.github.io/img/alipay.jpg From 0688d1ce72ea3a39eff1416c4812d82c70763e78 Mon Sep 17 00:00:00 2001 From: egrcc Date: Sat, 26 Mar 2016 13:31:58 +0800 Subject: [PATCH 28/48] go,php --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index df51462..9cc897b 100644 --- a/README.rst +++ b/README.rst @@ -12,9 +12,9 @@ zhihu-python:获取知乎信息 介绍 ---- -zhihu-python 采用 python2.7 编写,用来方便地获取知乎上各种内容的信息,并且可以方便地将答案备份导出为 txt 或 markdown 文件。由于知乎官方目前没有提供 api,所以有了此项目的存在。 +zhihu-python 采用 Python2.7 编写,用来方便地获取知乎上各种内容的信息,并且可以方便地将答案备份导出为 txt 或 markdown 文件。由于知乎官方目前没有提供 api,所以有了此项目的存在。 -使用 python3 的类似项目可以参见:`zhihu-py3 `_ 。 +使用 Python3 的类似项目可以参见:`zhihu-py3 `_ 。使用 PHP 的类似项目可以参见:`zhihu-php `_ 。使用 Go 的类似项目可以参见:`zhihu-go `_ 。 **注: 本项目代码均在 Ubuntu14.04 上使用 python2.7.6 编写和测试通过,其他环境可能存在一定问题。** From 1f2bdb1dd775287cc67ba2c3a557f3c65b778590 Mon Sep 17 00:00:00 2001 From: ttsxfwsy Date: Wed, 30 Mar 2016 22:51:57 +0800 Subject: [PATCH 29/48] change bs4 parser to lxml --- zhihu.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/zhihu.py b/zhihu.py index 600f04d..ff3f78d 100755 --- a/zhihu.py +++ b/zhihu.py @@ -107,7 +107,7 @@ def __init__(self, url, title=None): def parser(self): r = requests.get(self.url) - self.soup = BeautifulSoup(r.content) + self.soup = BeautifulSoup(r.content, "lxml") def get_title(self): if hasattr(self, "title"): @@ -182,7 +182,7 @@ def get_all_answers(self): for j in xrange(min(answers_num, 20)): if self.soup == None: self.parser() - soup = BeautifulSoup(self.soup.encode("utf-8")) + soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml") is_my_answer = False if soup.find_all("div", class_="zm-item-answer")[j].find("span", class_="count") == None: @@ -251,9 +251,9 @@ def get_all_answers(self): answer_list = r.json()["msg"] for j in xrange(min(answers_num - i * 20, 20)): - soup = BeautifulSoup(self.soup.encode("utf-8")) + soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml") - answer_soup = BeautifulSoup(answer_list[j]) + answer_soup = BeautifulSoup(answer_list[j], "lxml") if answer_soup.find("div", class_="zm-editable-content clearfix") == None: continue @@ -337,7 +337,7 @@ def __init__(self, user_url, user_id=None): def parser(self): r = requests.get(self.user_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") self.soup = soup def get_user_id(self): @@ -496,7 +496,7 @@ def get_followees(self): followee_url = self.user_url + "/followees" r = requests.get(followee_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for i in xrange((followees_num - 1) / 20 + 1): if i == 0: user_url_list = soup.find_all("h2", class_="zm-list-content-title") @@ -523,7 +523,7 @@ def get_followees(self): followee_list = r_post.json()["msg"] for j in xrange(min(followees_num - i * 20, 20)): - followee_soup = BeautifulSoup(followee_list[j]) + followee_soup = BeautifulSoup(followee_list[j], "lxml") user_link = followee_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) @@ -541,7 +541,7 @@ def get_followers(self): follower_url = self.user_url + "/followers" r = requests.get(follower_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for i in xrange((followers_num - 1) / 20 + 1): if i == 0: user_url_list = soup.find_all("h2", class_="zm-list-content-title") @@ -567,7 +567,7 @@ def get_followers(self): follower_list = r_post.json()["msg"] for j in xrange(min(followers_num - i * 20, 20)): - follower_soup = BeautifulSoup(follower_list[j]) + follower_soup = BeautifulSoup(follower_list[j], "lxml") user_link = follower_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) @@ -591,7 +591,7 @@ def get_asks(self): ask_url = self.user_url + "/asks?page=" + str(i + 1) r = requests.get(ask_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for question in soup.find_all("a", class_="question_link"): url = "http://www.zhihu.com" + question["href"] title = question.string.encode("utf-8") @@ -611,7 +611,7 @@ def get_answers(self): for i in xrange((answers_num - 1) / 20 + 1): answer_url = self.user_url + "/answers?page=" + str(i + 1) r = requests.get(answer_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for answer in soup.find_all("a", class_="question_link"): question_url = "http://www.zhihu.com" + answer["href"][0:18] question_title = answer.string.encode("utf-8") @@ -634,7 +634,7 @@ def get_collections(self): r = requests.get(collection_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") for collection in soup.find_all("div", class_="zm-profile-section-item zg-clear"): url = "http://www.zhihu.com" + \ collection.find("a", class_="zm-profile-fav-item-title")["href"] @@ -650,7 +650,7 @@ def get_likes(self): yield else: r = requests.get(self.user_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") # Handle the first liked item first_item = soup.find("div", attrs={'class':'zm-profile-section-item zm-item clearfix'}) first_item = first_item.find("div", attrs={'class':'zm-profile-section-main zm-profile-section-activity-main zm-profile-activity-page-item-main'}) @@ -717,7 +717,7 @@ def __init__(self, answer_url, question=None, author=None, upvote=None, content= def parser(self): r = requests.get(self.answer_url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") self.soup = soup def get_question(self): @@ -772,7 +772,7 @@ def get_content(self): else: if self.soup == None: self.parser() - soup = BeautifulSoup(self.soup.encode("utf-8")) + soup = BeautifulSoup(self.soup.encode("utf-8"), "lxml") answer = soup.find("div", class_="zm-editable-content clearfix") soup.body.extract() soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'})) @@ -962,7 +962,7 @@ def get_voters(self): # s = session # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") voters_info = soup.find_all("span")[1:-1] if len(voters_info) == 0: return @@ -997,7 +997,7 @@ def __init__(self, url, name=None, creator=None): self.creator = creator def parser(self): r = requests.get(self.url) - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, "lxml") self.soup = soup def get_name(self): @@ -1062,7 +1062,7 @@ def get_all_answers(self): i = 2 while True: r = requests.get(self.url + "?page=" + str(i)) - answer_soup = BeautifulSoup(r.content) + answer_soup = BeautifulSoup(r.content, "lxml") answer_list = answer_soup.find_all("div", class_="zm-item") if len(answer_list) == 0: break From 26622231a3782b36fa9c098b40c367e2139b2bed Mon Sep 17 00:00:00 2001 From: ttsxfwsy Date: Wed, 30 Mar 2016 23:01:40 +0800 Subject: [PATCH 30/48] Update README.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加 lxml 依赖 --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9cc897b..9a07f50 100644 --- a/README.rst +++ b/README.rst @@ -78,7 +78,7 @@ zhihu-python 采用 Python2.7 编写,用来方便地获取知乎上各种内 * `requests `_ * `html2text `_ * `termcolor `_ - +* `lxml `_ .. code:: bash From 494afb1707128058d8e5736f11888a24916bea1d Mon Sep 17 00:00:00 2001 From: egrcc Date: Sat, 16 Apr 2016 14:42:59 +0800 Subject: [PATCH 31/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=AA=8C=E8=AF=81?= =?UTF-8?q?=E7=A0=81=E9=94=99=E8=AF=AF=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- auth.py | 13 ++++++++----- requirements.txt | 3 ++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/auth.py b/auth.py index d58edb0..dac0e87 100644 --- a/auth.py +++ b/auth.py @@ -66,8 +66,8 @@ def __init__(self, message): def download_captcha(): - url = "http://www.zhihu.com/captcha.gif" - r = requests.get(url, params={"r": random.random()} ) + url = "https://www.zhihu.com/captcha.gif" + r = requests.get(url, params={"r": random.random(), "type": "login"} ) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") image_name = u"verify." + r.headers['content-type'].split("/")[1] @@ -124,9 +124,12 @@ def build_form(account, password): return form def upload_form(form): - if "email" in form: url = "http://www.zhihu.com/login/email" - elif "phone_num" in form: url = "http://www.zhihu.com/login/phone_num" - else: raise ValueError(u"账号类型错误") + if "email" in form: + url = "https://www.zhihu.com/login/email" + elif "phone_num" in form: + url = "https://www.zhihu.com/login/phone_num" + else: + raise ValueError(u"账号类型错误") headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", diff --git a/requirements.txt b/requirements.txt index 0ca5f5e..1bcec2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ requests beautifulsoup4 html2text -termcolor \ No newline at end of file +termcolor +lxml From e2839072434c8524792733d23af31fc86926883f Mon Sep 17 00:00:00 2001 From: egrcc Date: Sat, 16 Apr 2016 18:17:07 +0800 Subject: [PATCH 32/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20issue=20#53=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- auth.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/auth.py b/auth.py index dac0e87..adf6eaa 100644 --- a/auth.py +++ b/auth.py @@ -161,7 +161,7 @@ def upload_form(form): return {"error": {"code": int(result['errcode']), "message": result['msg'], "data": result['data'] } } else: Logging.warn(u"表单上传出现未知错误: \n \t %s )" % ( str(result) ) ) - return {"error": {"code": -1, "message": u"unknow error"} } + return {"error": {"code": -1, "message": u"unknown error"} } else: Logging.warn(u"无法解析服务器的响应内容: \n \t %s " % r.text ) return {"error": {"code": -2, "message": u"parse error"} } @@ -222,8 +222,8 @@ def login(account=None, password=None): """ result: {"result": True} - {"error": {"code": 19855555, "message": "unknow.", "data": "data" } } - {"error": {"code": -1, "message": u"unknow error"} } + {"error": {"code": 19855555, "message": "unknown.", "data": "data" } } + {"error": {"code": -1, "message": u"unknown error"} } """ result = upload_form(form_data) if "error" in result: @@ -231,8 +231,12 @@ def login(account=None, password=None): # 验证码错误 Logging.error(u"验证码输入错误,请准备重新输入。" ) return login() + elif result["error"]['code'] == 100005: + # 密码错误 + Logging.error(u"密码输入错误,请准备重新输入。" ) + return login() else: - Logging.warn(u"unknow error." ) + Logging.warn(u"unknown error." ) return False elif "result" in result and result['result'] == True: # 登录成功 From 7c07235f084c55180a6f9f193b5e8e7aedc950c4 Mon Sep 17 00:00:00 2001 From: Yufeng Bai Date: Wed, 20 Apr 2016 21:18:54 +0800 Subject: [PATCH 33/48] Add new feature Post and Column for zhihu --- zhihu.py | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/zhihu.py b/zhihu.py index ff3f78d..5db7fcb 100755 --- a/zhihu.py +++ b/zhihu.py @@ -92,6 +92,174 @@ reload(sys) sys.setdefaultencoding('utf8') +class Post: + url = None + meta = None + slug = None + + def __init__(self, url): + + if not re.compile(r"(http|https)://zhuanlan.zhihu.com/p/\d{8}").match(url): + raise ValueError("\"" + url + "\"" + " : it isn't a question url.") + else: + self.url = url + self.slug = re.compile(r"(http|https)://zhuanlan.zhihu.com/p/(\d{8})").match(url).group(2) + + def parser(self): + r = requests.get('https://zhuanlan.zhihu.com/api/posts/' + self.slug) + self.meta = r.json() + + def get_title(self): + if hasattr(self, "title"): + if platform.system() == 'Windows': + title = self.title.decode('utf-8').encode('gbk') + return title + else: + return self.title + else: + if self.meta == None: + self.parser() + meta = self.meta + title = meta['title'] + self.title = title + if platform.system() == 'Windows': + title = title.decode('utf-8').encode('gbk') + return title + else: + return title + + def get_content(self): + if self.meta == None: + self.parser() + meta = self.meta + content = meta['content'] + if platform.system() == 'Windows': + content = content.decode('utf-8').encode('gbk') + return content + else: + return content + + def get_author(self): + if hasattr(self, "author"): + return self.author + else: + if self.meta == None: + self.parser() + meta = self.meta + author_tag = meta['author'] + author = User(author_tag['profileUrl'],author_tag['slug']) + return author + + def get_column(self): + if self.meta == None: + self.parser() + meta = self.meta + column_url = 'https://zhuanlan.zhihu.com/' + meta['column']['slug'] + return Column(column_url, meta['column']['slug']) + + def get_likes(self): + if self.meta == None: + self.parser() + meta = self.meta + return int(meta["likesCount"]) + + def get_topics(self): + if self.meta == None: + self.parser() + meta = self.meta + for topic in meta['topics']: + yield topic + +class Column: + url = None + meta = None + + def __init__(self, url, slug=None): + + if not re.compile(r"(http|https)://zhuanlan.zhihu.com/([0-9a-zA-Z]+)").match(url): + raise ValueError("\"" + url + "\"" + " : it isn't a question url.") + else: + self.url = url + if slug == None: + self.slug = re.compile(r"(http|https)://zhuanlan.zhihu.com/([0-9a-zA-Z]+)").match(url).group(2) + else: + self.slug = slug + + def parser(self): + r = requests.get('https://zhuanlan.zhihu.com/api/columns/' + self.slug) + self.meta = r.json() + + def get_title(self): + if hasattr(self,"title"): + if platform.system() == 'Windows': + title = self.title.decode('utf-8').encode('gbk') + return title + else: + return self.title + else: + if self.meta == None: + self.parser() + meta = self.meta + title = meta['name'] + self.title = title + if platform.system() == 'Windows': + title = title.decode('utf-8').encode('gbk') + return title + else: + return title + + def get_description(self): + if self.meta == None: + self.parser() + meta = self.meta + description = meta['description'] + if platform.system() == 'Windows': + description = description.decode('utf-8').encode('gbk') + return description + else: + return description + + def get_followers_num(self): + if self.meta == None: + self.parser() + meta = self.meta + followers_num = int(meta['followersCount']) + return followers_num + + def get_posts_num(self): + if self.meta == None: + self.parser() + meta = self.meta + posts_num = int(meta['postsCount']) + return posts_num + + def get_creator(self): + if hasattr(self, "creator"): + return self.creator + else: + if self.meta == None: + self.parser() + meta = self.meta + creator_tag = meta['creator'] + creator = User(creator_tag['profileUrl'],creator_tag['slug']) + return creator + + def get_all_posts(self): + posts_num = self.get_posts_num() + if posts_num == 0: + print "No posts." + return + yield + else: + for i in xrange((posts_num - 1) / 20 + 1): + parm = {'limit': 20, 'offset': 20*i} + url = 'https://zhuanlan.zhihu.com/api/columns/' + self.slug + '/posts' + r = requests.get(url, params=parm) + posts_list = r.json() + for p in posts_list: + post_url = 'https://zhuanlan.zhihu.com/p/' + str(p['slug']) + yield Post(post_url) + class Question: url = None soup = None From f915be20850c5fb744d7c29319d3bb9b91074514 Mon Sep 17 00:00:00 2001 From: Yufeng Bai Date: Sat, 23 Apr 2016 11:32:40 +0800 Subject: [PATCH 34/48] Add documentation and test for Post and Column, and fix bug in fetching topics in post --- README.rst | 191 +++++++++++++++++++++++++++++++++++++++++++++++++++++ test.py | 56 ++++++++++++++++ zhihu.py | 4 +- 3 files changed, 250 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9a07f50..f2eb165 100644 --- a/README.rst +++ b/README.rst @@ -351,7 +351,101 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co print answers # # 代表所有答案的生成器对象 + + +Post:获取知乎文章 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Post 代表一个文章, Post 对象需传入该文章的 url ,如: + +.. code-block:: python + + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20235601" + post = Post(url) + +得到 Post 对象后,可以获取该文章的一些信息: + +.. code-block:: python + + # -*- coding: utf-8 -*- + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20770968" + post = Post(url) + + # 获取该文章的标题 + title = post.get_title() + # 获取该文章的内容 + content = post.get_content() + # 获取该文章的作者 + author = post.get_author() + # 获取该文章的所属专栏 + column = post.get_column() + # 获取该文章所属话题 + topics = post.get_topics() + + print title # 输出:夜读书|四月十九日 + print content + # 输出: + #

各位,晚上好。
... + # ...... + print author + # 输出: + for topic in topics: + print topic, # 输出:阅读 + print column + # 输出: + # Column类对象 + +Column:获取知乎专栏 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Column 代表一个专栏 Column 对象需传入该专栏 url ,如: + +.. code-block:: python + + from zhihu import Column + url = "http://zhuanlan.zhihu.com/daily" + column = Column(url) + +得到 Column 对象后,可以获取该专栏的一些信息: + +.. code-block:: python + + # -*- coding: utf-8 -*- + from zhihu import Column + + url = "http://zhuanlan.zhihu.com/daily" + column = Column(url) + + # 获取该专栏的标题 + title = column.get_title() + # 获取该专栏的描述 + description = column.get_description() + # 获取该专栏的作者 + creator = column.get_creator() + # 获取该专栏的文章数 + posts_num = column.get_posts_num() + # 获取该专栏的所有文章 + posts = column.get_all_posts() + + print title # 输出:知乎日报 + print description + # 输出: + # 知乎日报启动画面接受所有摄影师朋友们的投稿,将作品链接(如 Flickr、LOFTER 等等),发至邮箱 qidong (at) zhihu.com,并附上您的知乎个人页面地址即可。 + # + # 详细投稿要求: http://t.cn/zQyEpN5 + + print creator + # 输出: + # User类对象 + print posts_num # 150 + print posts + # 输出: + # Post类对象 综合实例 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -669,7 +763,104 @@ zhihu.Collection ---- 知乎收藏夹操作类 **Returns**: 包含该收藏夹下前 n 个回答的 generator 对象。其中每一个元素为代表一个回答的 Answer 对象 +zhihu.Column ---- 知乎专栏操作类 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +*class* zhihu. **Column** (*Column_url*) + + Column 以 url 为唯一标识,创建一个 Column 对象实例必须传入一个代表知乎专栏的 url (如:http://zhuanlan.zhihu.com/daily),需包含“http(s)://”。如果传入的不是代表专栏的 url ,程序会报错。通过调用 Column 类的一系列方法,获得该专栏的一些信息。 + + **Parameters**: + * **column_url** -- 该专栏的链接,字符串 + + **Returns**: 一个 Column 实例对象 + + **get_title** () + + 得到该专栏的题目。 + + **Returns**: 一个代表题目的字符串 + + **get_creator** () + + 得到该专栏的创建者 。 + + **Returns**: 一个 User 对象 + + **get_description** () + + 得到该专栏的描述 + + **Returns**: 一个专栏描述的字符串 + + **get_followers_num** () + + 得到该专栏的关注人数 + + **Returns**: 一个 int 型的整数 + + **get_posts_num** () + + 得到该专栏的所有文章数 + + **Returns**: 一个 int 型的整数 + + **get_content** () + + 得到该答案的内容。 + + **Returns**: 一个字符串 + + **get_posts** () + + 得到该专栏的所有文章 + + **Returns**:包含所有文章的 generator 对象。其中每一个元素为代表一个文章 Post 对象 + + +zhihu.Post ---- 知乎文章操作类 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +*class* zhihu. **Post** (*Post_url*) + + Post 以 url 为唯一标识,创建一个 Post 对象实例必须传入一个代表知乎文章的 url (如:http://zhuanlan.zhihu.com/p/20235601),需包含“http(s)://”。如果传入的不是代表文章的 url ,程序会报错。通过调用 Post 类的一系列方法,获得该文章的一些信息。 + + **Parameters**: + * **post_url** -- 该文章的链接,字符串 + + **Returns**: 一个 Post 实例对象 + + **get_title** () + + 得到该文章的题目。 + + **Returns**: 一个代表题目的字符串 + + **get_author** () + + 得到该文章的作者 。 + + **Returns**: 一个 User 对象 + + **get_content** () + + 得到该文章的内容 + + **Returns**: 一个文章描述的字符串 + **get_topics** () + + 得到该文章的话题。 + + **Returns**: 一个列表 + + **get_column** () + + 得到该文章的所属专栏 + + **Returns**:一个 Column 的实例对象 + + 联系我 ---------- diff --git a/test.py b/test.py index 13f2c0b..e07c7a8 100755 --- a/test.py +++ b/test.py @@ -25,6 +25,8 @@ from zhihu import Answer from zhihu import User from zhihu import Collection +from zhihu import Post +from zhihu import Column def question_test(url): @@ -197,6 +199,56 @@ def collection_test(collection_url): # 代表所有答案的生成器对象 +def post_test(post_url): + post = Post(post_url) + + # 获取该文章的标题 + title = post.get_title() + # 获取该文章的内容 + content = post.get_content() + # 获取该文章的作者 + author = post.get_author() + # 获取该文章的所属专栏 + column = post.get_column() + # 获取该文章所属话题 + topics = post.get_topics() + + print title # 输出: + print content + for topic in topics: + print topic, # 输出: + print "\n" + print author + # 输出: + # User类对象 + print column + # 输出: + # Column类对象 + + +def column_test(column_url): + + column = Column(column_url) + + # 获取该专栏的标题 + title = column.get_title() + # 获取该专栏的描述 + description = column.get_description() + # 获取该专栏的作者 + creator = column.get_creator() + # 获取该专栏的文章数 + posts_num = column.get_posts_num() + # 获取该专栏的所有文章 + posts = column.get_all_posts() + + print title + print description + print creator + # 输出: + # User类对象 + print posts_num + print posts + def test(): url = "http://www.zhihu.com/question/24269892" question = Question(url) @@ -231,6 +283,10 @@ def main(): user_test(user_url) collection_url = "http://www.zhihu.com/collection/36750683" collection_test(collection_url) + post_url = "http://zhuanlan.zhihu.com/p/20770968" + post_test(post_url) + column_url = "http://zhuanlan.zhihu.com/daily" + column_test(column_url) test() diff --git a/zhihu.py b/zhihu.py index 5db7fcb..3c309f1 100755 --- a/zhihu.py +++ b/zhihu.py @@ -167,8 +167,10 @@ def get_topics(self): if self.meta == None: self.parser() meta = self.meta + topic_list = [] for topic in meta['topics']: - yield topic + topic_list.append(topic['name']) + return topic_list class Column: url = None From bbc302973b61a62f7417ed8b432a299fd35b2595 Mon Sep 17 00:00:00 2001 From: liuwons Date: Sat, 23 Apr 2016 13:16:51 +0800 Subject: [PATCH 35/48] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E5=A4=B4=E5=83=8F=E9=93=BE=E6=8E=A5=E7=9A=84?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test.py | 3 +++ zhihu.py | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/test.py b/test.py index 13f2c0b..9728bdb 100755 --- a/test.py +++ b/test.py @@ -119,6 +119,8 @@ def user_test(user_url): agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() + # 获取该用户的头像url + head_img_url = user.get_head_img_url() # 获取该用户关注的人 followees = user.get_followees() @@ -140,6 +142,7 @@ def user_test(user_url): print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 + print head_img_url # https://pic2.zhimg.com/0626f4164009f291b26a79d96c6962c5_l.jpg print followees # diff --git a/zhihu.py b/zhihu.py index ff3f78d..e6231f7 100755 --- a/zhihu.py +++ b/zhihu.py @@ -365,6 +365,32 @@ def get_user_id(self): else: return user_id + def get_head_img_url(self, scale=4): + """ + By liuwons (https://github.com/liuwons) + 增加获取知乎识用户的头像url + scale对应的头像尺寸: + 1 - 25×25 + 3 - 75×75 + 4 - 100×100 + 6 - 150×150 + 10 - 250×250 + """ + scale_list = [1, 3, 4, 6, 10] + scale_name = '0s0ml0t000b' + if self.user_url == None: + print "I'm anonymous user." + return None + else: + if scale not in scale_list: + print 'Illegal scale.' + return None + if self.soup == None: + self.parser() + soup = self.soup + url = soup.find("img", class_="Avatar Avatar--l")["src"] + return url[:-5] + scale_name[scale] + url[-4:] + def get_data_id(self): """ By yannisxu (https://github.com/yannisxu) From 42fbce65833047283eee5a99e793f75a8039df3f Mon Sep 17 00:00:00 2001 From: liuwons Date: Sat, 23 Apr 2016 13:36:33 +0800 Subject: [PATCH 36/48] =?UTF-8?q?=E4=BF=AE=E6=94=B9README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.rst b/README.rst index 9a07f50..d4179c2 100644 --- a/README.rst +++ b/README.rst @@ -271,6 +271,8 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() + # 获取该用户的头像url + head_img_url = user.get_head_img_url() # 获取该用户关注的人 followees = user.get_followees() @@ -291,6 +293,7 @@ User 代表一个用户,处理用户相关操作。创建一个 User 对象需 print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 + print head_img_url # https://pic2.zhimg.com/0626f4164009f291b26a79d96c6962c5_l.jpg print followees # @@ -517,6 +520,14 @@ zhihu.User ---- 知乎用户操作类 得到该用户获得的感谢数。 **Returns**: 代表感谢数的 int 型整数 + + **get_head_img_url** (scale) + + 获取用户头像url。 + + **Parameters**: **scale** int 型整数,代表尺寸: 1(25×25), 3(75×75), 4(100×100), 6(150×150), 10(250×250) + + **Returns**: 对应尺寸头像的图片链接, 字符串 **get_asks_num** () From 931fe0c3e9833155e887f09eb1a6a7dc2e0fc14b Mon Sep 17 00:00:00 2001 From: egrcc Date: Mon, 25 Apr 2016 01:20:53 +0800 Subject: [PATCH 37/48] readme --- README.rst | 131 ++++++++++++++++++++++++----------------------------- 1 file changed, 59 insertions(+), 72 deletions(-) diff --git a/README.rst b/README.rst index f2eb165..92de105 100644 --- a/README.rst +++ b/README.rst @@ -2,8 +2,8 @@ zhihu-python:获取知乎信息 =============================== :Author: `egrcc `_ ( `微博 `_ | `电邮 `_ ) -:Committer: `Eureka22 `_ , `lufo816 `_ , `LuoZijun `_ -:Update: 09/09 2015 +:Contributors: 参见 `Contributors `_ +:Update: 04/23 2016 .. contents:: @@ -353,56 +353,10 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co # 代表所有答案的生成器对象 -Post:获取知乎文章 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Post 代表一个文章, Post 对象需传入该文章的 url ,如: - -.. code-block:: python - - from zhihu import Post - - url = "http://zhuanlan.zhihu.com/p/20235601" - post = Post(url) - -得到 Post 对象后,可以获取该文章的一些信息: - -.. code-block:: python - - # -*- coding: utf-8 -*- - from zhihu import Post - - url = "http://zhuanlan.zhihu.com/p/20770968" - post = Post(url) - - # 获取该文章的标题 - title = post.get_title() - # 获取该文章的内容 - content = post.get_content() - # 获取该文章的作者 - author = post.get_author() - # 获取该文章的所属专栏 - column = post.get_column() - # 获取该文章所属话题 - topics = post.get_topics() - - print title # 输出:夜读书|四月十九日 - print content - # 输出: - #

各位,晚上好。
... - # ...... - print author - # 输出: - for topic in topics: - print topic, # 输出:阅读 - print column - # 输出: - # Column类对象 - Column:获取知乎专栏 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Column 代表一个专栏 Column 对象需传入该专栏 url ,如: +Column 代表一个专栏,创建一个 Column 对象需传入该专栏的 url ,如: .. code-block:: python @@ -447,6 +401,54 @@ Column 代表一个专栏 Column 对象需传入该专栏 url ,如: # 输出: # Post类对象 + +Post:获取知乎文章 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Post 代表一个文章,创建一个 Post 对象需传入该文章的 url ,如: + +.. code-block:: python + + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20235601" + post = Post(url) + +得到 Post 对象后,可以获取该文章的一些信息: + +.. code-block:: python + + # -*- coding: utf-8 -*- + from zhihu import Post + + url = "http://zhuanlan.zhihu.com/p/20770968" + post = Post(url) + + # 获取该文章的标题 + title = post.get_title() + # 获取该文章的内容 + content = post.get_content() + # 获取该文章的作者 + author = post.get_author() + # 获取该文章的所属专栏 + column = post.get_column() + # 获取该文章所属话题 + topics = post.get_topics() + + print title # 输出:夜读书|四月十九日 + print content + # 输出: + #

各位,晚上好。
... + # ...... + print author + # 输出: + for topic in topics: + print topic, # 输出:阅读 + print column + # 输出: + # Column类对象 + + 综合实例 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -783,25 +785,25 @@ zhihu.Column ---- 知乎专栏操作类 **get_creator** () - 得到该专栏的创建者 。 + 得到该专栏的创建者。 **Returns**: 一个 User 对象 **get_description** () - 得到该专栏的描述 + 得到该专栏的描述。 **Returns**: 一个专栏描述的字符串 **get_followers_num** () - 得到该专栏的关注人数 + 得到该专栏的关注人数。 **Returns**: 一个 int 型的整数 **get_posts_num** () - 得到该专栏的所有文章数 + 得到该专栏的所有文章数。 **Returns**: 一个 int 型的整数 @@ -813,7 +815,7 @@ zhihu.Column ---- 知乎专栏操作类 **get_posts** () - 得到该专栏的所有文章 + 得到该专栏的所有文章。 **Returns**:包含所有文章的 generator 对象。其中每一个元素为代表一个文章 Post 对象 @@ -838,13 +840,13 @@ zhihu.Post ---- 知乎文章操作类 **get_author** () - 得到该文章的作者 。 + 得到该文章的作者。 **Returns**: 一个 User 对象 **get_content** () - 得到该文章的内容 + 得到该文章的内容。 **Returns**: 一个文章描述的字符串 @@ -856,7 +858,7 @@ zhihu.Post ---- 知乎文章操作类 **get_column** () - 得到该文章的所属专栏 + 得到该文章的所属专栏。 **Returns**:一个 Column 的实例对象 @@ -868,18 +870,3 @@ zhihu.Post ---- 知乎文章操作类 - github:https://github.com/egrcc - email:zhaolujun1994@gmail.com - -捐赠 ----------- - -如果本项目有帮到你,欢迎捐赠支持。 - -支付宝(左),微信支付(右): - - -.. image:: http://egrcc.github.io/img/alipay.jpg - -.. image:: http://egrcc.github.io/img/wechat.png - - - From 8cebda396f975abe783684807f11998725005a15 Mon Sep 17 00:00:00 2001 From: egrcc Date: Mon, 25 Apr 2016 01:31:23 +0800 Subject: [PATCH 38/48] readme --- README.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 92de105..b2dfcb4 100644 --- a/README.rst +++ b/README.rst @@ -353,10 +353,10 @@ Collection 代表一个收藏夹,处理收藏夹相关操作。创建一个 Co # 代表所有答案的生成器对象 -Column:获取知乎专栏 +Column:获取专栏信息 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Column 代表一个专栏,创建一个 Column 对象需传入该专栏的 url ,如: +Column 代表一个专栏,处理专栏相关操作。创建一个 Column 对象需传入该专栏的 url ,如: .. code-block:: python @@ -389,7 +389,9 @@ Column 代表一个专栏,创建一个 Column 对象需传入该专栏的 url print title # 输出:知乎日报 print description # 输出: - # 知乎日报启动画面接受所有摄影师朋友们的投稿,将作品链接(如 Flickr、LOFTER 等等),发至邮箱 qidong (at) zhihu.com,并附上您的知乎个人页面地址即可。 + # 知乎日报启动画面接受所有摄影师朋友们的投稿,将作品链接 + #(如 Flickr、LOFTER 等等),发至邮箱 qidong (at) zhihu.com, + # 并附上您的知乎个人页面地址即可。 # # 详细投稿要求: http://t.cn/zQyEpN5 @@ -402,10 +404,10 @@ Column 代表一个专栏,创建一个 Column 对象需传入该专栏的 url # Post类对象 -Post:获取知乎文章 +Post:获取专栏文章信息 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Post 代表一个文章,创建一个 Post 对象需传入该文章的 url ,如: +Post 代表一个专栏文章,处理专栏文章相关操作。创建一个 Post 对象需传入该文章的 url ,如: .. code-block:: python @@ -820,7 +822,7 @@ zhihu.Column ---- 知乎专栏操作类 **Returns**:包含所有文章的 generator 对象。其中每一个元素为代表一个文章 Post 对象 -zhihu.Post ---- 知乎文章操作类 +zhihu.Post ---- 知乎专栏文章操作类 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ *class* zhihu. **Post** (*Post_url*) From c61034d24ae33a9fc25df70d46121097e82fce38 Mon Sep 17 00:00:00 2001 From: egrcc Date: Mon, 25 Apr 2016 01:36:05 +0800 Subject: [PATCH 39/48] readme --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index b2dfcb4..7638e6b 100644 --- a/README.rst +++ b/README.rst @@ -772,7 +772,7 @@ zhihu.Column ---- 知乎专栏操作类 *class* zhihu. **Column** (*Column_url*) - Column 以 url 为唯一标识,创建一个 Column 对象实例必须传入一个代表知乎专栏的 url (如:http://zhuanlan.zhihu.com/daily),需包含“http(s)://”。如果传入的不是代表专栏的 url ,程序会报错。通过调用 Column 类的一系列方法,获得该专栏的一些信息。 + Column 以 url 为唯一标识,创建一个 Column 对象实例必须传入一个代表知乎专栏的 url (如:http://zhuanlan.zhihu.com/daily),需包含“http(s)://”。如果传入的不是代表专栏的 url ,程序会报错。通过调用 Column 类的一系列方法,获得该专栏的一些信息。该类由 `@johnnyluck `_ 添加。 **Parameters**: * **column_url** -- 该专栏的链接,字符串 @@ -827,7 +827,7 @@ zhihu.Post ---- 知乎专栏文章操作类 *class* zhihu. **Post** (*Post_url*) - Post 以 url 为唯一标识,创建一个 Post 对象实例必须传入一个代表知乎文章的 url (如:http://zhuanlan.zhihu.com/p/20235601),需包含“http(s)://”。如果传入的不是代表文章的 url ,程序会报错。通过调用 Post 类的一系列方法,获得该文章的一些信息。 + Post 以 url 为唯一标识,创建一个 Post 对象实例必须传入一个代表知乎文章的 url (如:http://zhuanlan.zhihu.com/p/20235601),需包含“http(s)://”。如果传入的不是代表文章的 url ,程序会报错。通过调用 Post 类的一系列方法,获得该文章的一些信息。该类由 `@johnnyluck `_ 添加。 **Parameters**: * **post_url** -- 该文章的链接,字符串 From c480d6930d96c46c4db12d14a68e6f30845bb6f4 Mon Sep 17 00:00:00 2001 From: Lujun Zhao Date: Mon, 25 Apr 2016 14:22:53 +0800 Subject: [PATCH 40/48] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index f578e82..e4fb77f 100644 --- a/README.rst +++ b/README.rst @@ -621,7 +621,7 @@ zhihu.User ---- 知乎用户操作类 **get_head_img_url** (scale) - 获取用户头像url。 + 获取用户头像url。该方法由 `@liuwons `_ 添加。 **Parameters**: **scale** int 型整数,代表尺寸: 1(25×25), 3(75×75), 4(100×100), 6(150×150), 10(250×250) From b6e01323bb82fb22a6a6f9c6e52b6c8814442154 Mon Sep 17 00:00:00 2001 From: egrcc Date: Tue, 17 May 2016 11:59:05 +0800 Subject: [PATCH 41/48] _ --- ...02\347\232\204\345\233\236\347\255\224.md" | 19 ----------------- ...2\347\232\204\345\233\236\347\255\224.txt" | 21 ------------------- 2 files changed, 40 deletions(-) delete mode 100644 "markdown/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.md" delete mode 100644 "text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" diff --git "a/markdown/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.md" "b/markdown/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.md" deleted file mode 100644 index f058555..0000000 --- "a/markdown/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.md" +++ /dev/null @@ -1,19 +0,0 @@ -# 现实可以有多美好? -## 作者: 邓岂 赞同: 23 -最近高考失利啦,考的很差很差。高三辛苦弄到的加分全都没用上。 -心里想不开决定自杀。 -于是给已经有男朋友的她,发了一条长长的短信,告诉她我要去复读,请她以后不要再联系我了,把我忘了吧。其实现在的我也不再单身,有了一个女朋友。可我在决定去死的时 -候,选择的唯一一个告别的人却依然是她。 -那晚,我爬到了我家阳台上(24楼),准备一下子跳下去。 -可是她回了我的短信,大意是她明白我想说不仅仅是去复读,而是要做危险的事。她还说,她不舍得忘掉我。 -那一晚心情沉痛,关了手机,从家里骑了2个小时的自行车,到她家的楼下,看着她家的灯火一夜未息。直到天蒙蒙亮,我方才骑车回家。回家打开手机,112个未接电话,其 -中有98个来自她,另外来自我和她共同的朋友们。 -她们说,半夜收到她给她们发的短信,告诉她们她很担心我会做出很多出格的事情。 -我很偏执,但我看到、知道这一切的时候,我的心肺都融化了。 - -她拒绝过我。甚至我还亲自见证了她是怎样一步一步的爱上那个男生,并最终和她走到了一起。 -有人说,我是这世界上最傻最傻的备胎。可是我想告诉所有这样说的人,做这样一个备胎可是生命中最美好的事儿啊。 - -以上。 - -#### 原链接: http://www.zhihu.com/question/24269892/answer/27338490 \ No newline at end of file diff --git "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" "b/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" deleted file mode 100644 index b1e3921..0000000 --- "a/text/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237/\347\216\260\345\256\236\345\217\257\344\273\245\346\234\211\345\244\232\347\276\216\345\245\275\357\274\237--\351\202\223\345\262\202\347\232\204\345\233\236\347\255\224.txt" +++ /dev/null @@ -1,21 +0,0 @@ -现实可以有多美好? - -作者: 邓岂 赞同: 23 - -最近高考失利啦,考的很差很差。高三辛苦弄到的加分全都没用上。 -心里想不开决定自杀。 -于是给已经有男朋友的她,发了一条长长的短信,告诉她我要去复读,请她以后不要再联系我了,把我忘了吧。其实现在的我也不再单身,有了一个女朋友。可我在决定去死的时候,选择的唯一一个告别的人却依然是她。 -那晚,我爬到了我家阳台上(24楼),准备一下子跳下去。 -可是她回了我的短信,大意是她明白我想说不仅仅是去复读,而是要做危险的事。她还说,她不舍得忘掉我。 -那一晚心情沉痛,关了手机,从家里骑了2个小时的自行车,到她家的楼下,看着她家的灯火一夜未息。直到天蒙蒙亮,我方才骑车回家。回家打开手机,112个未接电话,其中有98个来自她,另外来自我和她共同的朋友们。 -她们说,半夜收到她给她们发的短信,告诉她们她很担心我会做出很多出格的事情。 -我很偏执,但我看到、知道这一切的时候,我的心肺都融化了。 - -她拒绝过我。甚至我还亲自见证了她是怎样一步一步的爱上那个男生,并最终和她走到了一起。 -有人说,我是这世界上最傻最傻的备胎。可是我想告诉所有这样说的人,做这样一个备胎可是生命中最美好的事儿啊。 - -以上。 - - - -原链接: http://www.zhihu.com/question/24269892/answer/27338490 \ No newline at end of file From 26f33cfe90f108b4cb2c11af217087649a451eea Mon Sep 17 00:00:00 2001 From: Enaunimes Date: Wed, 18 May 2016 00:04:41 +0800 Subject: [PATCH 42/48] =?UTF-8?q?=E7=AE=80=E5=8C=96=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=E5=88=A4=E6=96=AD=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- auth.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/auth.py b/auth.py index 2856c4e..ec4b4f0 100644 --- a/auth.py +++ b/auth.py @@ -82,15 +82,7 @@ def download_captcha(): elif platform.system() == "Darwin": Logging.info(u"Command: open %s &" % image_name ) os.system("open %s &" % image_name ) - elif platform.system() == "SunOS": - os.system("open %s &" % image_name ) - elif platform.system() == "FreeBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "Unix": - os.system("open %s &" % image_name ) - elif platform.system() == "OpenBSD": - os.system("open %s &" % image_name ) - elif platform.system() == "NetBSD": + elif platform.system() in ("SunOS", "FreeBSD", "Unix", "OpenBSD", "NetBSD"): os.system("open %s &" % image_name ) elif platform.system() == "Windows": os.system("%s" % image_name ) From 9afc0b7981ac3b058904be70b9ea83f9266cc051 Mon Sep 17 00:00:00 2001 From: Enaunimes Date: Wed, 18 May 2016 00:19:04 +0800 Subject: [PATCH 43/48] =?UTF-8?q?=E4=BD=BF=E7=94=A8=20getpass.getpass=20?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E5=AF=86=E7=A0=81=EF=BC=8C=E9=81=BF=E5=85=8D?= =?UTF-8?q?=E8=BE=93=E5=85=A5=E6=97=B6=E5=9B=9E=E6=98=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://docs.python.org/2/library/getpass.html --- auth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auth.py b/auth.py index ec4b4f0..ed51290 100644 --- a/auth.py +++ b/auth.py @@ -4,6 +4,7 @@ # Build-in / Std import os, sys, time, platform, random import re, json, cookielib +from getpass import getpass # requirements import requests, termcolor @@ -207,8 +208,7 @@ def login(account=None, password=None): if account == None: sys.stdout.write(u"请输入登录账号: ") account = raw_input() - sys.stdout.write(u"请输入登录密码: ") - password = raw_input() + password = getpass("请输入登录密码: ") form_data = build_form(account, password) """ From 0bd093b9a9751a81dcc824ba0e5ee1e5340ff51e Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 18 May 2016 01:11:34 +0800 Subject: [PATCH 44/48] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=99=BB=E5=BD=95=20bu?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- auth.py | 8 ++-- zhihu.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 125 insertions(+), 25 deletions(-) diff --git a/auth.py b/auth.py index adf6eaa..2856c4e 100644 --- a/auth.py +++ b/auth.py @@ -67,7 +67,7 @@ def __init__(self, message): def download_captcha(): url = "https://www.zhihu.com/captcha.gif" - r = requests.get(url, params={"r": random.random(), "type": "login"} ) + r = requests.get(url, params={"r": random.random(), "type": "login"}, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") image_name = u"verify." + r.headers['content-type'].split("/")[1] @@ -103,7 +103,7 @@ def download_captcha(): def search_xsrf(): url = "http://www.zhihu.com/" - r = requests.get(url) + r = requests.get(url, verify=False) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") results = re.compile(r"\ 0: @@ -886,7 +958,7 @@ def get_likes(self): 'start': latest_data_time, '_xsrf': _xsrf, } - r = requests.post(post_url, data=data, headers=header) + r = requests.post(post_url, data=data, headers=header, verify=False) response_size = r.json()["msg"][0] response_html = r.json()["msg"][1] return @@ -912,7 +984,14 @@ def __init__(self, answer_url, question=None, author=None, upvote=None, content= self.content = content def parser(self): - r = requests.get(self.answer_url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.answer_url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") self.soup = soup @@ -1157,7 +1236,14 @@ def get_voters(self): # create_session() # s = session # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) - r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") voters_info = soup.find_all("span")[1:-1] if len(voters_info) == 0: @@ -1192,7 +1278,14 @@ def __init__(self, url, name=None, creator=None): if creator != None: self.creator = creator def parser(self): - r = requests.get(self.url) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url, headers=headers, verify=False) soup = BeautifulSoup(r.content, "lxml") self.soup = soup @@ -1257,7 +1350,14 @@ def get_all_answers(self): yield Answer(answer_url, question, author) i = 2 while True: - r = requests.get(self.url + "?page=" + str(i)) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(self.url + "?page=" + str(i), headers=headers, verify=False) answer_soup = BeautifulSoup(r.content, "lxml") answer_list = answer_soup.find_all("div", class_="zm-item") if len(answer_list) == 0: From d029066af0508b4d3cf13d5fbaeeac35467fe07e Mon Sep 17 00:00:00 2001 From: Enaunimes Date: Wed, 18 May 2016 01:32:28 +0800 Subject: [PATCH 45/48] =?UTF-8?q?=E9=98=B2=E6=AD=A2=E6=84=8F=E5=A4=96?= =?UTF-8?q?=E6=B3=84=E6=BC=8Fcookie?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index db4561e..f26d885 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ docs/_build/ # PyBuilder target/ + +# Prevent accidental cookie leak +cookie From 2b433646f0afa6f0f3cddd9bf337e8b7510771e8 Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 18 May 2016 16:25:05 +0800 Subject: [PATCH 46/48] topics --- test.py | 4 ++++ zhihu.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/test.py b/test.py index c7a9ceb..9e06300 100755 --- a/test.py +++ b/test.py @@ -128,6 +128,7 @@ def user_test(user_url): followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() + topics = user.get_topics() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 @@ -166,6 +167,9 @@ def user_test(user_url): if i == 41: break + for topic in topics: + print topic + print asks # # 代表该用户提的所有问题的生成器对象 diff --git a/zhihu.py b/zhihu.py index 7046f87..e00e939 100755 --- a/zhihu.py +++ b/zhihu.py @@ -654,6 +654,22 @@ def get_followers_num(self): .find_all("a")[1].strong.string) return followers_num + def get_topics_num(self): + if self.user_url == None: + print "I'm anonymous user." + return 0 + else: + if self.soup == None: + self.parser() + soup = self.soup + topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[1].strong.string.encode("utf-8") + I='' + for i in topics_num: + if i.isdigit(): + I=I+i + topics_num=int(I) + return topics_num + def get_agree_num(self): if self.user_url == None: print "I'm anonymous user." @@ -812,6 +828,55 @@ def get_followers(self): user_link = follower_soup.find("h2", class_="zm-list-content-title").a yield User(user_link["href"], user_link.string.encode("utf-8")) + def get_topics(self): + if self.user_url == None: + print "I'm anonymous user." + return + yield + else: + topics_num = self.get_topics_num() + # print topics_num + if topics_num == 0: + return + yield + else: + topics_url = self.user_url + "/topics" + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/" + } + r = requests.get(topics_url, headers=headers, verify=False) + soup = BeautifulSoup(r.content, "lxml") + for i in xrange((topics_num - 1) / 20 + 1): + if i == 0: + topic_list = soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + else: + post_url = topics_url + _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] + offset = i * 20 + data = { + '_xsrf': _xsrf, + 'offset': offset, + 'start': 0 + } + header = { + 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", + 'Host': "www.zhihu.com", + 'Referer': topics_url + } + r_post = requests.post(post_url, data=data, headers=header, verify=False) + + topic_data = r_post.json()["msg"][1] + topic_soup = BeautifulSoup(topic_data, "lxml") + topic_list = topic_soup.find_all("div", class_="zm-profile-section-item zg-clear") + for j in xrange(min(topics_num - i * 20, 20)): + yield topic_list[j].find("strong").string.encode("utf-8") + def get_asks(self): """ By ecsys (https://github.com/ecsys) From 49bed784d99e6fe489ea0630601168f0b47eebbc Mon Sep 17 00:00:00 2001 From: egrcc Date: Wed, 18 May 2016 16:45:27 +0800 Subject: [PATCH 47/48] topic-num --- zhihu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zhihu.py b/zhihu.py index e00e939..68af702 100755 --- a/zhihu.py +++ b/zhihu.py @@ -662,7 +662,7 @@ def get_topics_num(self): if self.soup == None: self.parser() soup = self.soup - topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[1].strong.string.encode("utf-8") + topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[-1].strong.string.encode("utf-8") I='' for i in topics_num: if i.isdigit(): From 1e24d4dfa960eacddb566f00269eb3d1878a4e00 Mon Sep 17 00:00:00 2001 From: egrcc Date: Mon, 18 Jul 2016 19:32:34 +0800 Subject: [PATCH 48/48] deprecated --- README.rst | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/README.rst b/README.rst index e4fb77f..9b5ad28 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,7 @@ zhihu-python:获取知乎信息 =============================== -:Author: `egrcc `_ ( `微博 `_ | `电邮 `_ ) -:Contributors: 参见 `Contributors `_ -:Update: 04/23 2016 - +**注意: 本项目不再维护更新!** .. contents:: @@ -875,11 +872,4 @@ zhihu.Post ---- 知乎专栏文章操作类 **Returns**:一个 Column 的实例对象 - -联系我 ----------- - -- 微博:http://weibo.com/u/2948739432 -- github:https://github.com/egrcc -- email:zhaolujun1994@gmail.com