diff --git a/README.md b/README.md new file mode 100644 index 0000000..20f0641 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# Fetcher + +Fetch news from URLs. + +# Intro + +Fetch depth: 1 + +# Development Tips + +These files need to be modified when adding a new site: +- main.go: add the entrance URL. +- links.go -> SetLinks(): add a case with a regex that matches the target URLs, e.g. if the URL must contain `about`, param 2 is `.*?about.*` +- sites/newsite: copy the files from a sibling folder, then develop until the tests pass. +- post.go -> TreatPost(): add a case for the new site's domain. diff --git a/internal/fetcher/fetcher.go b/internal/fetcher/fetcher.go index 1d0074a..020180a 100644 --- a/internal/fetcher/fetcher.go +++ b/internal/fetcher/fetcher.go @@ -38,6 +38,13 @@ func BreadthFirst(f func(item string), worklist []string) { } func Crawl(_url string) { + defer func() { + if err := recover(); err != nil { + e := err.(error) + log.Println(e) + PanicLog(e) + } + }() f := FetcherFactory(_url) log.Printf("[*] Deal with: [%s]\n", _url) log.Println("[*] Fetch links ...") @@ -102,8 +109,8 @@ func DelRoutine(folder string, n int) error { return nil } -func ErrLog(msg string) error { - filePath := "./errLog.txt" +func PanicLog(_err error) error { + filePath := "./PanicLog.txt" file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) if err != nil { return err @@ -111,7 +118,21 @@ func ErrLog(msg string) error { defer file.Close() write := bufio.NewWriter(file) write.WriteString("[" + time.Now().Format(time.RFC3339) + "]--------------------------------------\n") - write.WriteString(msg + "\n") + write.WriteString(_err.Error() + "\n") write.Flush() return nil } + +func ErrLog(msg string) error { + // filePath := "./errLog.txt" + // file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) + // if err != nil { + // return err + // } + // defer file.Close() + // write := bufio.NewWriter(file) + // write.WriteString("[" + time.Now().Format(time.RFC3339) + 
"]--------------------------------------\n") + // write.WriteString(msg + "\n") + // write.Flush() + return nil +} diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index bc4fbc0..1c3b47b 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -10,9 +10,13 @@ func TestCrawl(t *testing.T) { for { BreadthFirst(Crawl, []string{ // "https://www.boxun.com/rolling.shtml", - "https://www.dwnews.com", + // "https://www.dwnews.com", + // "https://www.zaobao.com/realtime/world", + // "https://www.zaobao.com/news/world", // "https://www.voachinese.com", // "https://www.rfa.org/mandarin/", + // "https://news.ltn.com.tw/list/breakingnews", + "https://www.cna.com.tw/list/aall.aspx", }) log.Println("Sleep a sec ...") diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 11b2106..fa0b8dc 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -34,6 +34,20 @@ func (f *Fetcher) SetLinks() error { case "www.rfa.org": f.Links = LinksFilter(links, `.*?/.*?-\d*.html`) KickOutLinksMatchPath(&f.Links, "about") + case "www.zaobao.com": + newsWorld := LinksFilter(links, `.*?/news/world/.*`) + newsChina := LinksFilter(links, `.*?/news/china/.*`) + realtimeWorld := LinksFilter(links, `.*?/realtime/world/.*`) + realtimeChina := LinksFilter(links, `.*?/realtime/china/.*`) + f.Links = append(append(append(newsWorld, newsChina...), realtimeWorld...), realtimeChina...) + case "news.ltn.com.tw": + f.Links = LinksFilter(links, `https://news.*/news/.*`) + case "www.cna.com.tw": + newsFirst := LinksFilter(links, `.*?/news/firstnews/.*`) + newsWorld := LinksFilter(links, `.*?/news/aopl/.*`) + newsPolitical := LinksFilter(links, `.*?/news/aipl/.*`) + newsTW := LinksFilter(links, `.*?/news/acn/.*`) + f.Links = append(append(append(newsFirst, newsWorld...), newsPolitical...), newsTW...) 
} return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 74634cd..822e6a8 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -1,7 +1,6 @@ package fetcher import ( - "fmt" "net/url" "testing" ) @@ -18,25 +17,36 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://www.voachinese.com") + // u, err := url.Parse("https://news.ltn.com.tw/list/breakingnews") + // assertLinks := []string{ + // "https://news.ltn.com.tw/news/society/breakingnews/3278253", + // "https://news.ltn.com.tw/news/society/breakingnews/3278250", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278225", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278170", + // } + u, err := url.Parse("https://www.cna.com.tw/list/aall.aspx") + assertLinks := []string{ + "https://www.cna.com.tw/news/aopl/202009290075.aspx", + "https://www.cna.com.tw/news/firstnews/202009290051.aspx", + "https://www.cna.com.tw/news/acn/202009290063.aspx", + "https://www.cna.com.tw/news/aipl/202009290055.aspx", + } if err != nil { t.Errorf("Url Parse fail!\n%s", err) } var f = &Fetcher{ Entrance: u, - // Entrance: "https://www.voachinese.com", } f.SetLinks() - // assertLink := "https://www.voachinese.com/a/who-remains-tight-lipped-experts-sent-investigate-coronavirus-china-20200713/5500866.html" - assertLink := "https://www.voachinese.com/a/fire-still-raging-aboard-navy-ship-docked-in-california-20200713/5500960.html" shot := 0 for _, link := range f.Links { - fmt.Println(link) - if link == assertLink { - shot++ + for _, v := range assertLinks { + if link == v { + shot++ + } } } - if shot == 0 { - t.Errorf("want: %v, got: %v", 1, shot) + if shot != len(assertLinks) { + t.Errorf("want: %v, got: %v", len(assertLinks), shot) } } diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index 6b8b82f..03f5ce9 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go 
@@ -6,13 +6,18 @@ import ( "io/ioutil" "log" "net/url" + "os" "path/filepath" + "strings" "time" "github.com/wedojava/fetcher/internal/fetcher/sites/boxun" + "github.com/wedojava/fetcher/internal/fetcher/sites/cna" "github.com/wedojava/fetcher/internal/fetcher/sites/dwnews" + "github.com/wedojava/fetcher/internal/fetcher/sites/ltn" "github.com/wedojava/fetcher/internal/fetcher/sites/rfa" "github.com/wedojava/fetcher/internal/fetcher/sites/voachinese" + "github.com/wedojava/fetcher/internal/fetcher/sites/zaobao" "github.com/wedojava/fetcher/internal/htmldoc" "github.com/wedojava/gears" "golang.org/x/net/html" @@ -87,35 +92,69 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + case "www.zaobao.com": + post := zaobao.Post(*p) + if err := zaobao.SetPost(&post); err != nil { + return err + } + *p = Post(post) + case "news.ltn.com.tw": + post := ltn.Post(*p) + if err := ltn.SetPost(&post); err != nil { + return err + } + *p = Post(post) + case "www.cna.com.tw": + post := cna.Post(*p) + if err := cna.SetPost(&post); err != nil { + return err + } + *p = Post(post) + default: + return fmt.Errorf("switch no case on: %s", p.Domain) } // Save post to file - if err := p.SetFilename(); err != nil { + if err := p.setFilename(); err != nil { return err } - if err := p.SavePost(); err != nil { + if err := p.savePost(); err != nil { return err } return nil } -func (p *Post) SavePost() error { +func (p *Post) savePost() error { folderPath := filepath.Join("wwwroot", p.Domain) gears.MakeDirAll(folderPath) if p.Filename == "" { - return errors.New("SavePost need a filename, but got none.") + return errors.New("savePost need a filename, but got none.") + } + fpath := filepath.Join(folderPath, p.Filename) + // !+ rm files with same title + files, err := ioutil.ReadDir(folderPath) + if err != nil { + return err + } + for _, f := range files { + if !f.IsDir() && strings.Contains(f.Name(), p.Title) { + err = os.Remove(filepath.Join(folderPath, f.Name())) + if err != nil { 
+ return err + } + } } - filepath := filepath.Join(folderPath, p.Filename) + // !- rm files with same title if p.Body == "" { - p.Body = "[-] Fetch error on visit: " + p.URL.String() + p.Body = "savePost p.Body = \"\"" } - err := ioutil.WriteFile(filepath, []byte(p.Body), 0644) + err = ioutil.WriteFile(fpath, []byte(p.Body), 0644) if err != nil { return err } return nil } -func (p *Post) SetFilename() error { +func (p *Post) setFilename() error { t, err := time.Parse(time.RFC3339, p.Date) if err != nil { return err diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index 01308c2..a861833 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -2,6 +2,7 @@ package fetcher import ( "fmt" + "log" "testing" "time" @@ -10,7 +11,9 @@ import ( func TestSetAndSavePost(t *testing.T) { // p := PostFactory("https://www.dwnews.com/经济/60203253") - p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + // p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + // p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + p := PostFactory("https://www.cna.com.tw/news/aopl/202009290075.aspx") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoC error: %v", err) @@ -25,15 +28,23 @@ func TestSetAndSavePost(t *testing.T) { func TestTreatPost(t *testing.T) { tcs := []string{ - "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", - "https://www.dwnews.com/经济/60203253", - "https://www.dwnews.com/全球/60203234", - "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", - "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", - "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + // "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", + // 
"https://www.dwnews.com/经济/60203253", + // "https://www.dwnews.com/全球/60203234", + // "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", + // "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", + // "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + // "https://www.zaobao.com/realtime/world/story20200825-1079575", + // "https://www.zaobao.com/news/world/story20200825-1079477", + // "https://www.zaobao.com.sg/realtime/world/story20200901-1081441", + // "https://news.ltn.com.tw/news/world/breakingnews/3278726", + "https://www.cna.com.tw/news/aopl/202009290075.aspx", } for _, tc := range tcs { p := PostFactory(tc) - p.TreatPost() + err := p.TreatPost() + if err != nil { + log.Println(err) + } } } diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go new file mode 100644 index 0000000..d0c2a1b --- /dev/null +++ b/internal/fetcher/sites/cna/cna.go @@ -0,0 +1,156 @@ +package cna + +import ( + "errors" + "fmt" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := setDate(p); err != nil { + return err + } + if err := setTitle(p); err != nil { + return err + } + if err := setBody(p); err != nil { + return err + } + return nil +} + +func setDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByItemprop(p.DOC, "dateModified") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("SetData got nothing.") + } + tY := cs[0][:4] + tM := cs[0][5:7] + tD := 
cs[0][8:10] + tH := cs[0][11:13] + tm := cs[0][14:16] + yy, err := strconv.Atoi(tY) + mm, err := strconv.Atoi(tM) + dd, err := strconv.Atoi(tD) + h, err := strconv.Atoi(tH) + m, err := strconv.Atoi(tm) + if err != nil { + return err + } + // China doesn't have daylight saving. It uses a fixed 8 hour offset from UTC. + secondsEastOfUTC := int((8 * time.Hour).Seconds()) + beijing := time.FixedZone("Beijing Time", secondsEastOfUTC) + t := time.Date(yy, time.Month(mm), dd, h, m, 0, 0, beijing) + p.Date = t.Format(time.RFC3339) + + return nil +} + +func setTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element
(.*?)
`) + for _, v := range re.FindAllSubmatch(r, -1) { + ps = append(ps, v[1]) + } + if len(ps) == 0 { + return "", fmt.Errorf("no matched")
+ }
+ for _, p := range ps {
+ b.Write(p)
+ b.Write([]byte(" \n"))
+ }
+ body := b.String()
+ re = regexp.MustCompile(`「`)
+ body = re.ReplaceAllString(body, "“")
+ re = regexp.MustCompile(`」`)
+ body = re.ReplaceAllString(body, "”")
+ re = regexp.MustCompile(`
test text
交通部長林佳龍28日向行政院長蘇貞昌報告安心旅遊成效,蘇貞昌支持交通部觀光局安心旅遊補助續辦到10月底,預計中秋及雙十連假出遊民眾,仍可享用這項補助。 (看完整報導)
+國發會報喜,28日宣布8月景氣燈號脫離黃藍燈、轉為代表穩定的綠燈,綜合判斷分數也較上月大增5分、升至26分,國發會認為,景氣從低點反轉向上的跡象愈來愈明顯,台灣經濟漸趨回穩。 (看完整報導)
+武漢肺炎從中國爆發並蔓延全球迄今不到一年,迄今已逾百萬人喪命。世衛警告,若不採取更多集體行動,病故人數恐倍增。另一方面,中國正大範圍為特定人員接種2019冠狀病毒疾病疫苗,紐時報導,沒有其他國家在常規藥物試驗程序之外,以如此之大的規模給人們注射未經檢驗的疫苗。中國這種急切的做法相當於一場豪賭。(看完整報導)
+高加索地區的亞塞拜然與亞美尼亞28日進入交戰第2日,已導致數十人喪生,雙方為這場戰鬥指責彼此,世界領袖已促請雙方冷靜,各界擔心這場戰鬥引發全面衝突,可能捲入區域大國俄羅斯和土耳其。「雙亞」夙怨糾結30年,因主張脫離亞塞拜然的納哥諾卡拉巴克地區迭起紛爭。 (看完整報導)
+川普政府原訂28日起禁止美國用戶下載熱門短影音分享應用程式TikTok,聯邦法官27日在最後一刻做出裁決,暫緩實施這項具有政治色彩的禁令,讓TikTok暫時逃過一劫。 (看完整報導)
+導演鍾孟宏執導的電影「陽光普照」,將代表台灣角逐美國第93屆奧斯卡最佳國際影片獎。文化部28日表示,2020年共有18部國片報名,經過甄選後推薦由「陽光普照」代表台灣參賽,甄選委員認為本片「親子議題刻劃深刻,觸動人心,製作品質領先群倫,備受影展肯定,國際能見度佳。」(看完整報導)
+一名菲律賓移工檢疫期滿採檢,確診武漢肺炎。但在採檢結果出爐前,個案已進到社區活動約1.5天時間。疫情指揮中心28日表示,現已改為檢疫期滿前採檢,未來不會發生類似狀況。 (看完整報導)
+藝人歐陽娜娜、張韶涵傳將在中國國慶晚會獻唱「我的祖國」等歌曲,文化部28日表示,如果大陸委員會確認違法,最重可處新台幣50萬元罰鍰。此外,阿美族人楊品驊在海峽論壇上自稱是中國人,原民會主委夷將.拔路兒28日表示,不反對與中國大陸互動,但必須認清自己是中華民國國民,不是中國人;不能忍受用個人的名義代表整個族群。 (看完整報導)
+美國總統大選首場電視辯論即將登場之際,川普的財務紀錄又掀爭議。「紐約時報」27日爆料,川普在贏得大選的2016年,只繳了750美元(約新台幣2萬2000元)聯邦所得稅。川普對紐時的報導不屑一顧,他說自己「付了很多錢,也繳了很多的州所得稅」。 (看完整報導)
+法國經濟學家皮凱提新書無法在中國出版,只是反映出中國審查制度的冰山一角。本地編輯表示,現在有多重出版禁忌,且有時不知道紅線劃在哪裡,出好書越來越難,因為審核制度越來越嚴格,大家選書趨於保守。(看完整報導)
+上午8點同步發送電子報!快來訂閱「早安世界」給你最精華的新聞摘要。
若有任何建議請來信告訴我們,想獲得更多最新資訊快來和中央社做朋友。
歡迎訂閱中央社國際新聞電子報,每週三、日發報,掌握世界脈動。