#!/usr/bin/env python
"""Load a problem page from an online judge and print its statement.

Usage: python parse_source.py problem_url

The page body is extracted as HTML and passed through html2text before
being printed under a "### Problem Statement" heading.
"""

import os
import sys

from pyquery import PyQuery as pq
import html2text


class Lintcode(object):
    """Extracts problem details from a lintcode.com problem page."""

    def __init__(self):
        self.url_algo = "http://www.lintcode.com/en/problem/"

    def get_src_page(self, src_url):
        """Load ``src_url`` and return it as a PyQuery document."""
        return pq(url=src_url)

    def get_src_detail(self, src_url):
        """Return the problem's details as a dict.

        The dict has the keys ``'title'``, ``'tags'``, ``'level'`` and
        ``'body'``; ``'body'`` is an HTML fragment sliced out of the
        ``#problem-detail`` element.
        """
        src_page = self.get_src_page(src_url)
        problem_detail = src_page('#problem-detail')

        heading = problem_detail('h4')
        difficulty_level = heading('.label').text()
        title = heading('.m-l-sm').text()
        tags = [tag.text for tag in problem_detail('#tags')('a')]

        raw_detail = problem_detail('div').html()
        # NOTE(review): the marker and replacement literals below are kept
        # exactly as they appear in this copy of the file. They look like
        # HTML tags that were stripped during extraction (an empty search
        # string makes ``replace`` insert text between every character) --
        # confirm against the upstream source before relying on them.
        body_start = raw_detail.find('\n\n')
        body_end = raw_detail.find('Tags')
        body = raw_detail[body_start:body_end]
        body = body.replace('', '\n\n')
        body = body.replace('', '\n\n')

        return {
            'title': title,
            'tags': tags,
            'level': difficulty_level,
            'body': body,
        }


class Leetcode(object):
    """Extracts problem details from a leetcode.com problem page."""

    def __init__(self):
        self.url_algo = "https://leetcode.com/problemset/algorithms/"

    def get_src_page(self, src_url):
        """Load ``src_url`` and return it as a PyQuery document."""
        return pq(url=src_url)

    def get_src_detail(self, src_url):
        """Return a dict whose ``'body'`` is the problem statement HTML.

        The body is sliced out of the ``.question-content`` element.
        """
        src_page = self.get_src_page(src_url)
        raw_detail = src_page('.question-content').html()
        # NOTE(review): as written here the start and end markers are the
        # same string and the replacements search for an empty string; the
        # original tag literals appear to have been lost in extraction --
        # confirm against the upstream source.
        body_start = raw_detail.find('\n\n')
        body_end = raw_detail.find('\n\n')
        body = raw_detail[body_start:body_end]
        body = body.replace('', '\n\n')
        body = body.replace('', '\n\n')
        return {'body': body}

    # NOTE(review): the three helpers below take only ``src_page`` (no
    # ``self``). Indentation was lost in this copy of the file, so their
    # placement inside the class is assumed; they are static methods so
    # they can be called on the class or on an instance.
    @staticmethod
    def get_src_tags(src_page):
        """Return the text of every ``.btn.btn-xs.btn-primary`` element."""
        raw_tags = src_page('.btn.btn-xs.btn-primary')
        return [tag.text() for tag in raw_tags.items()]

    @staticmethod
    def get_src_title(src_page):
        """Return the part of the page ``<title>`` before the first ``|``.

        Surrounding whitespace is stripped, so a title without ``|`` is
        returned whole instead of losing its last character.
        """
        return src_page('title').text().split('|')[0].strip()

    @staticmethod
    def get_difficulty(src_page):
        """Placeholder: difficulty lookup is not implemented; returns None."""
        # TODO: look the problem up in the problem-set listing.
        search_url = "https://leetcode.com/problemset/algorithms/"
        return None


class Hihocoder(object):
    """Extracts problem details from a saved hihocoder.com problem page."""

    def __init__(self):
        self.url_algo = "http://hihocoder.com/contest/mstest2015april/problems"

    def get_src_page(self, src_url):
        """Parse the local file ``src_url`` and return a PyQuery document."""
        return pq(filename=src_url)

    def get_src_title(self, src_page):
        """Return the problem title from the ``h3.panel-title`` element.

        The title is the text after the first ``': '``. When that separator
        is missing the whole heading text is returned.
        """
        raw_title = src_page('h3.panel-title').text()
        _, separator, title = raw_title.partition(': ')
        return title if separator else raw_title

    def get_src_detail(self, src_url):
        """Return a dict with the problem's ``'title'`` and ``'body'`` HTML."""
        src_page = self.get_src_page(src_url)
        return {
            'title': self.get_src_title(src_page),
            'body': src_page('#tl-problem-content').html(),
        }


def main(argv):
    """Print the problem statement for the page named in ``argv[1]``.

    ``argv`` is expected to be ``[script_name, problem_url]``. With any
    other length the usage message is printed and the process exits with
    status 1. Arguments that start with neither the lintcode nor the
    leetcode URL prefix are handled by ``Hihocoder``, which reads them as
    a local file name and also prints a title and question heading.
    """
    if len(argv) != 2:
        print("Usage: python parse_source.py problem_url")
        sys.exit(1)

    url = argv[1]
    lintcode_url = 'http://www.lintcode.com'
    leetcode_url = 'https://leetcode.com'
    converter = html2text.HTML2Text()

    if url.startswith(lintcode_url):
        src_detail = Lintcode().get_src_detail(url)
    elif url.startswith(leetcode_url):
        src_detail = Leetcode().get_src_detail(url)
    else:
        # temp: everything else is treated as a hihocoder page.
        # Fetch the detail once; both the title and the body come from it.
        src_detail = Hihocoder().get_src_detail(url)
        print("# " + src_detail['title'])
        print("")
        print("## Question")
        print("")

    print("### Problem Statement")
    print("")
    print(converter.handle(src_detail['body']))


if __name__ == "__main__":
    main(sys.argv)