// PanicLog appends one record describing _err to ./PanicLog.txt in the
// current working directory, creating the file when it does not exist yet.
//
// A record is a header line holding the current time in RFC 3339 format,
// followed by a line holding _err.Error(). A nil _err is ignored and nothing
// is written. The returned error reports a failure to open, write, flush or
// close the log file.
func PanicLog(_err error) error {
	if _err == nil {
		// Calling Error on a nil error would panic inside the logger itself.
		return nil
	}
	file, err := os.OpenFile("./PanicLog.txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		return err
	}
	w := bufio.NewWriter(file)
	_, err = w.WriteString("[" + time.Now().Format(time.RFC3339) + "]--------------------------------------\n" + _err.Error() + "\n")
	if err == nil {
		err = w.Flush()
	}
	// Close explicitly so a failed close is reported as well, but never let it
	// mask an earlier write or flush error.
	if cerr := file.Close(); err == nil {
		err = cerr
	}
	return err
}
"]--------------------------------------\n") + // write.WriteString(msg + "\n") + // write.Flush() + return nil +} diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index bc4fbc0..1c3b47b 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -10,9 +10,13 @@ func TestCrawl(t *testing.T) { for { BreadthFirst(Crawl, []string{ // "https://www.boxun.com/rolling.shtml", - "https://www.dwnews.com", + // "https://www.dwnews.com", + // "https://www.zaobao.com/realtime/world", + // "https://www.zaobao.com/news/world", // "https://www.voachinese.com", // "https://www.rfa.org/mandarin/", + // "https://news.ltn.com.tw/list/breakingnews", + "https://www.cna.com.tw/list/aall.aspx", }) log.Println("Sleep a sec ...") diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 11b2106..fa0b8dc 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -34,6 +34,20 @@ func (f *Fetcher) SetLinks() error { case "www.rfa.org": f.Links = LinksFilter(links, `.*?/.*?-\d*.html`) KickOutLinksMatchPath(&f.Links, "about") + case "www.zaobao.com": + newsWorld := LinksFilter(links, `.*?/news/world/.*`) + newsChina := LinksFilter(links, `.*?/news/china/.*`) + realtimeWorld := LinksFilter(links, `.*?/realtime/world/.*`) + realtimeChina := LinksFilter(links, `.*?/realtime/china/.*`) + f.Links = append(append(append(newsWorld, newsChina...), realtimeWorld...), realtimeChina...) + case "news.ltn.com.tw": + f.Links = LinksFilter(links, `https://news.*/news/.*`) + case "www.cna.com.tw": + newsFirst := LinksFilter(links, `.*?/news/firstnews/.*`) + newsWorld := LinksFilter(links, `.*?/news/aopl/.*`) + newsPolitical := LinksFilter(links, `.*?/news/aipl/.*`) + newsTW := LinksFilter(links, `.*?/news/acn/.*`) + f.Links = append(append(append(newsFirst, newsWorld...), newsPolitical...), newsTW...) 
} return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 74634cd..822e6a8 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -1,7 +1,6 @@ package fetcher import ( - "fmt" "net/url" "testing" ) @@ -18,25 +17,36 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://www.voachinese.com") + // u, err := url.Parse("https://news.ltn.com.tw/list/breakingnews") + // assertLinks := []string{ + // "https://news.ltn.com.tw/news/society/breakingnews/3278253", + // "https://news.ltn.com.tw/news/society/breakingnews/3278250", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278225", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278170", + // } + u, err := url.Parse("https://www.cna.com.tw/list/aall.aspx") + assertLinks := []string{ + "https://www.cna.com.tw/news/aopl/202009290075.aspx", + "https://www.cna.com.tw/news/firstnews/202009290051.aspx", + "https://www.cna.com.tw/news/acn/202009290063.aspx", + "https://www.cna.com.tw/news/aipl/202009290055.aspx", + } if err != nil { t.Errorf("Url Parse fail!\n%s", err) } var f = &Fetcher{ Entrance: u, - // Entrance: "https://www.voachinese.com", } f.SetLinks() - // assertLink := "https://www.voachinese.com/a/who-remains-tight-lipped-experts-sent-investigate-coronavirus-china-20200713/5500866.html" - assertLink := "https://www.voachinese.com/a/fire-still-raging-aboard-navy-ship-docked-in-california-20200713/5500960.html" shot := 0 for _, link := range f.Links { - fmt.Println(link) - if link == assertLink { - shot++ + for _, v := range assertLinks { + if link == v { + shot++ + } } } - if shot == 0 { - t.Errorf("want: %v, got: %v", 1, shot) + if shot != len(assertLinks) { + t.Errorf("want: %v, got: %v", len(assertLinks), shot) } } diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index 6b8b82f..03f5ce9 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go 
@@ -6,13 +6,18 @@ import ( "io/ioutil" "log" "net/url" + "os" "path/filepath" + "strings" "time" "github.com/wedojava/fetcher/internal/fetcher/sites/boxun" + "github.com/wedojava/fetcher/internal/fetcher/sites/cna" "github.com/wedojava/fetcher/internal/fetcher/sites/dwnews" + "github.com/wedojava/fetcher/internal/fetcher/sites/ltn" "github.com/wedojava/fetcher/internal/fetcher/sites/rfa" "github.com/wedojava/fetcher/internal/fetcher/sites/voachinese" + "github.com/wedojava/fetcher/internal/fetcher/sites/zaobao" "github.com/wedojava/fetcher/internal/htmldoc" "github.com/wedojava/gears" "golang.org/x/net/html" @@ -87,35 +92,69 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + case "www.zaobao.com": + post := zaobao.Post(*p) + if err := zaobao.SetPost(&post); err != nil { + return err + } + *p = Post(post) + case "news.ltn.com.tw": + post := ltn.Post(*p) + if err := ltn.SetPost(&post); err != nil { + return err + } + *p = Post(post) + case "www.cna.com.tw": + post := cna.Post(*p) + if err := cna.SetPost(&post); err != nil { + return err + } + *p = Post(post) + default: + return fmt.Errorf("switch no case on: %s", p.Domain) } // Save post to file - if err := p.SetFilename(); err != nil { + if err := p.setFilename(); err != nil { return err } - if err := p.SavePost(); err != nil { + if err := p.savePost(); err != nil { return err } return nil } -func (p *Post) SavePost() error { +func (p *Post) savePost() error { folderPath := filepath.Join("wwwroot", p.Domain) gears.MakeDirAll(folderPath) if p.Filename == "" { - return errors.New("SavePost need a filename, but got none.") + return errors.New("savePost need a filename, but got none.") + } + fpath := filepath.Join(folderPath, p.Filename) + // !+ rm files with same title + files, err := ioutil.ReadDir(folderPath) + if err != nil { + return err + } + for _, f := range files { + if !f.IsDir() && strings.Contains(f.Name(), p.Title) { + err = os.Remove(filepath.Join(folderPath, f.Name())) + if err != nil { 
+ return err + } + } } - filepath := filepath.Join(folderPath, p.Filename) + // !- rm files with same title if p.Body == "" { - p.Body = "[-] Fetch error on visit: " + p.URL.String() + p.Body = "savePost p.Body = \"\"" } - err := ioutil.WriteFile(filepath, []byte(p.Body), 0644) + err = ioutil.WriteFile(fpath, []byte(p.Body), 0644) if err != nil { return err } return nil } -func (p *Post) SetFilename() error { +func (p *Post) setFilename() error { t, err := time.Parse(time.RFC3339, p.Date) if err != nil { return err diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index 01308c2..a861833 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -2,6 +2,7 @@ package fetcher import ( "fmt" + "log" "testing" "time" @@ -10,7 +11,9 @@ import ( func TestSetAndSavePost(t *testing.T) { // p := PostFactory("https://www.dwnews.com/经济/60203253") - p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + // p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + // p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + p := PostFactory("https://www.cna.com.tw/news/aopl/202009290075.aspx") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoC error: %v", err) @@ -25,15 +28,23 @@ func TestSetAndSavePost(t *testing.T) { func TestTreatPost(t *testing.T) { tcs := []string{ - "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", - "https://www.dwnews.com/经济/60203253", - "https://www.dwnews.com/全球/60203234", - "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", - "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", - "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + // "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", + // 
"https://www.dwnews.com/经济/60203253", + // "https://www.dwnews.com/全球/60203234", + // "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", + // "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", + // "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + // "https://www.zaobao.com/realtime/world/story20200825-1079575", + // "https://www.zaobao.com/news/world/story20200825-1079477", + // "https://www.zaobao.com.sg/realtime/world/story20200901-1081441", + // "https://news.ltn.com.tw/news/world/breakingnews/3278726", + "https://www.cna.com.tw/news/aopl/202009290075.aspx", } for _, tc := range tcs { p := PostFactory(tc) - p.TreatPost() + err := p.TreatPost() + if err != nil { + log.Println(err) + } } } diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go new file mode 100644 index 0000000..d0c2a1b --- /dev/null +++ b/internal/fetcher/sites/cna/cna.go @@ -0,0 +1,156 @@ +package cna + +import ( + "errors" + "fmt" + "net/url" + "regexp" + "strconv" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := setDate(p); err != nil { + return err + } + if err := setTitle(p); err != nil { + return err + } + if err := setBody(p); err != nil { + return err + } + return nil +} + +func setDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByItemprop(p.DOC, "dateModified") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("SetData got nothing.") + } + tY := cs[0][:4] + tM := cs[0][5:7] + tD := 
cs[0][8:10] + tH := cs[0][11:13] + tm := cs[0][14:16] + yy, err := strconv.Atoi(tY) + mm, err := strconv.Atoi(tM) + dd, err := strconv.Atoi(tD) + h, err := strconv.Atoi(tH) + m, err := strconv.Atoi(tm) + if err != nil { + return err + } + // China doesn't have daylight saving. It uses a fixed 8 hour offset from UTC. + secondsEastOfUTC := int((8 * time.Hour).Seconds()) + beijing := time.FixedZone("Beijing Time", secondsEastOfUTC) + t := time.Date(yy, time.Month(mm), dd, h, m, 0, 0, beijing) + p.Date = t.Format(time.RFC3339) + + return nil +} + +func setTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element ") + } + title := n[0].FirstChild.Data + if strings.Contains(title, "| 娛樂 |") || + strings.Contains(title, "| 政治 |") || + strings.Contains(title, "| 兩岸 |") || + strings.Contains(title, "| 運動 |") || + strings.Contains(title, "| 文化 |") || + strings.Contains(title, "| 地方 |") || + strings.Contains(title, "| 社會 |") || + strings.Contains(title, "| 生活 |") || + strings.Contains(title, "| 科技 |") || + strings.Contains(title, "| 證券 |") || + strings.Contains(title, "| 產經 |") { + return errors.New("ignore post on purpose: " + p.URL.String()) + } + title = strings.ReplaceAll(title, " | 中央社 CNA", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func setBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := cna(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func cna(p *Post) (string, error) { + if p.DOC == nil { + return "", fmt.Errorf("[-] p.DOC is nil") + } + doc := p.DOC + body := "" + // Fetch 
content nodes + nodes := htmldoc.ElementsByTagAndClass(doc, "div", "paragraph") + if len(nodes) == 0 { + return "", errors.New("[-] There is no element class is paragraph` from: " + p.URL.String()) + } + n := nodes[0] + plist := htmldoc.ElementsByTag(n, "h2", "p") + for _, v := range plist { + if v.FirstChild != nil { + body += v.FirstChild.Data + " \n" + } + } + + body = strings.ReplaceAll(body, "「", "“") + body = strings.ReplaceAll(body, "」", "”") + body = strings.ReplaceAll(body, "</a>", "") + + re := regexp.MustCompile(`<a.*?>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<script.*?</script>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<iframe.*?</iframe>`) + body = re.ReplaceAllString(body, "") + + return body, nil +} diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go new file mode 100644 index 0000000..5fb3761 --- /dev/null +++ b/internal/fetcher/sites/cna/cna_test.go @@ -0,0 +1,78 @@ +package cna + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +var p = PostFactory("https://www.cna.com.tw/news/aopl/202009300058.aspx") + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetDate(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-09-30T10:54:00+08:00" + if p.Date != want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + +func TestSetTitle(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setTitle(p); err != nil { 
+ t.Errorf("test SetPost err: %v", err) + } + want := "被爆10年沒繳稅 川普:避稅計畫展現我的才智 | 國際" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestCna(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := cna(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + var p = PostFactory("https://www.cna.com.tw/news/afe/202009290241.aspx") // should be ignore + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", err) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} diff --git a/internal/fetcher/sites/dwnews/dwnews.go b/internal/fetcher/sites/dwnews/dwnews.go index 4157594..c33e941 100644 --- a/internal/fetcher/sites/dwnews/dwnews.go +++ b/internal/fetcher/sites/dwnews/dwnews.go @@ -105,7 +105,8 @@ func Dwnews(p *Post) (string, error) { } articleDoc := nodes[0].FirstChild plist := htmldoc.ElementsByTag(articleDoc, "p") - if articleDoc.FirstChild.Data == "div" { // to fetch the summary block + if articleDoc.FirstChild != nil && + articleDoc.FirstChild.Data == "div" { // to fetch the summary block // body += fmt.Sprintf("\n > %s \n", plist[0].FirstChild.Data) // redundant summary body += fmt.Sprintf("\n > ") } diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go new file mode 100644 index 0000000..d9edce9 --- /dev/null +++ b/internal/fetcher/sites/ltn/ltn.go @@ -0,0 +1,145 @@ +package ltn + +import ( + "bytes" + "errors" + "fmt" + "net/url" + "regexp" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename 
string +} + +func SetPost(p *Post) error { + if err := setDate(p); err != nil { + return err + } + if err := setTitle(p); err != nil { + return err + } + if err := setBody(p); err != nil { + return err + } + return nil +} + +func setDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByProperty(p.DOC, "article:published_time") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("SetData got nothing.") + } + p.Date = cs[0] + return nil +} + +func setTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element <title>") + } + title := n[0].FirstChild.Data + if strings.Contains(title, "- 娛樂") || + strings.Contains(title, "- 食譜") || + strings.Contains(title, "- 地產") || + strings.Contains(title, "- 體育") || + strings.Contains(title, "- 地方") || + strings.Contains(title, "- 蒐奇") || + strings.Contains(title, "- 社會") || + strings.Contains(title, "- 生活") || + strings.Contains(title, "- 时尚") || + strings.Contains(title, "- 健康") || + strings.Contains(title, "- 汽車") || + strings.Contains(title, "- 財經") { + return errors.New("ignore post on purpose: " + p.URL.String()) + } + title = strings.ReplaceAll(title, " - 自由時報電子報", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func setBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := ltn(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func ltn(p *Post) (string, error) 
{ + if p.Raw == nil { + return "", fmt.Errorf("[-] p.Raw is nil") + } + raw := p.Raw + // Fetch content nodes + r := htmldoc.DivWithAttr2(raw, "data-desc", "內容頁") + ps := [][]byte{} + b := bytes.Buffer{} + re := regexp.MustCompile(`<p>(.*?)</p>`) + for _, v := range re.FindAllSubmatch(r, -1) { + ps = append(ps, v[1]) + } + if len(ps) == 0 { + return "", fmt.Errorf("no <p> matched") + } + for _, p := range ps { + b.Write(p) + b.Write([]byte(" \n")) + } + body := b.String() + re = regexp.MustCompile(`「`) + body = re.ReplaceAllString(body, "“") + re = regexp.MustCompile(`」`) + body = re.ReplaceAllString(body, "”") + re = regexp.MustCompile(`<a.*?>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`</a>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<script.*?</script>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<blockquote.*?</blockquote>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<iframe.*?</iframe>`) + body = re.ReplaceAllString(body, "") + + return body, nil +} diff --git a/internal/fetcher/sites/ltn/ltn_test.go b/internal/fetcher/sites/ltn/ltn_test.go new file mode 100644 index 0000000..d4f5e50 --- /dev/null +++ b/internal/fetcher/sites/ltn/ltn_test.go @@ -0,0 +1,77 @@ +package ltn + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +var p = PostFactory("https://news.ltn.com.tw/news/world/breakingnews/3277899") + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetDate(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-09-01T12:14:01+08:00" + if p.Date 
!= want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + +func TestSetTitle(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setTitle(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "反送中12港青逃台被逮 林鄭月娥暗示應「送中」 - 國際" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestLtn(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := ltn(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} diff --git a/internal/fetcher/sites/zaobao/zaobao.go b/internal/fetcher/sites/zaobao/zaobao.go new file mode 100644 index 0000000..0de223d --- /dev/null +++ b/internal/fetcher/sites/zaobao/zaobao.go @@ -0,0 +1,124 @@ +package zaobao + +import ( + "errors" + "fmt" + "net/url" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := SetDate(p); err != nil { + return err + } + if err := SetTitle(p); err != nil { + return err + } + if err := SetBody(p); err != nil { + return err + } + return nil +} + +func SetDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByProperty(p.DOC, "article:modified_time") + cs := []string{} + for _, meta := range metas { + for _, a := range 
meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("dwnews SetData got nothing.") + } + p.Date = cs[0] + return nil +} + +func SetTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element <title>") + } + title := n[0].FirstChild.Data + title = strings.ReplaceAll(title, " | 联合早报网", "") + title = strings.ReplaceAll(title, " | 早报", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func SetBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := Zaobao(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func Zaobao(p *Post) (string, error) { + if p.DOC == nil { + return "", fmt.Errorf("[-] p.DOC is nil") + } + doc := p.DOC + body := "" + // Fetch content nodes + nodes := htmldoc.ElementsByTagAndClass(doc, "div", "article-content-container") + if len(nodes) == 0 { + nodes = htmldoc.ElementsByTagAndClass(doc, "div", "article-content-rawhtml") + } + if len(nodes) == 0 { + return "", errors.New("[-] There is no tag named `<article>` from: " + p.URL.String()) + } + plist := htmldoc.ElementsByTag(nodes[0], "p") + for _, v := range plist { + if v.FirstChild == nil { + continue + } else if v.FirstChild.FirstChild != nil && + v.FirstChild.Data == "strong" { + a := htmldoc.ElementsByTag(v, "span") + for _, aa := range a { + body += aa.FirstChild.Data + } + body += " \n" + } else { + body += v.FirstChild.Data + " \n" + } + } + body = strings.ReplaceAll(body, "span \n", "") + return body, nil +} diff --git 
a/internal/fetcher/sites/zaobao/zaobao_test.go b/internal/fetcher/sites/zaobao/zaobao_test.go new file mode 100644 index 0000000..4ae06ca --- /dev/null +++ b/internal/fetcher/sites/zaobao/zaobao_test.go @@ -0,0 +1,77 @@ +package zaobao + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +var p = PostFactory("https://www.zaobao.com/news/world/story20200830-1080786") + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetDate(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-08-30T07:48:25+08:00" + if p.Date != want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + +func TestSetTitle(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetTitle(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "国际特稿:美国副总统候选人 哈里斯魅力多元" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestZaobao(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := Zaobao(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} diff --git a/internal/htmldoc/htmldoc.go 
b/internal/htmldoc/htmldoc.go index 0472c89..1262263 100644 --- a/internal/htmldoc/htmldoc.go +++ b/internal/htmldoc/htmldoc.go @@ -84,6 +84,51 @@ func ExtractLinks(weburl string) ([]string, error) { return links, nil } +func DivWithAttr(doc *html.Node, attrName, attrValue string) []*html.Node { + var nodes []*html.Node + if attrName == "" || attrValue == "" || doc == nil { + return nil + } + if doc.Type == html.ElementNode { + if "div" == doc.Data { + for _, a := range doc.Attr { + if a.Key == attrName && a.Val == attrValue { + nodes = append(nodes, doc) + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, DivWithAttr(c, attrName, attrValue)...) + } + return nodes +} + +func DivWithAttr2(raw []byte, attrName, attrValue string) []byte { + if attrName == "" || attrValue == "" || raw == nil { + return nil + } + z := html.NewTokenizer(bytes.NewReader(raw)) + for { + tt := z.Next() + t := z.Token() + if err := z.Err(); err != nil && err == io.EOF { + break + } + switch tt { + case html.StartTagToken: + if "div" == t.Data { + for _, a := range t.Attr { + if a.Key == attrName && a.Val == attrValue { + return z.Buffered() + } + } + } + } + } + return nil +} + func ElementsNext(doc *html.Node) []*html.Node { nodes := []*html.Node{} if doc == nil { @@ -232,6 +277,7 @@ func ElementsByTagAndId2(raw []byte, tag, id string) []byte { } return nil } + func ElementsByTagAndType(doc *html.Node, tag, attrType string) []*html.Node { var nodes []*html.Node if tag == "" || attrType == "" || doc == nil { @@ -271,16 +317,67 @@ func ElementsNextByTag(doc *html.Node, tag string) []*html.Node { return nodes } -func MetasByName(doc *html.Node, name ...string) []*html.Node { +// MetasByName focus on `<meta name="dateModified" content="2020/09/29 11:27" />` +func MetasByName(doc *html.Node, values ...string) []*html.Node { var nodes []*html.Node - if doc == nil || name == nil { + if doc == nil || values == nil { return nil } if doc.Type == 
html.ElementNode { if doc.Data == "meta" { for _, a := range doc.Attr { if a.Key == "name" { - for _, v := range name { + for _, v := range values { + if v == a.Val { + nodes = append(nodes, doc) + } + } + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, MetasByName(c, values...)...) + } + return nodes +} + +// MetasByItemprop focus on `<meta itemprop="dateModified" content="2020/09/29 11:27" />` +func MetasByItemprop(doc *html.Node, values ...string) []*html.Node { + var nodes []*html.Node + if doc == nil || values == nil { + return nil + } + if doc.Type == html.ElementNode { + if doc.Data == "meta" { + for _, a := range doc.Attr { + if a.Key == "itemprop" { + for _, v := range values { + if v == a.Val { + nodes = append(nodes, doc) + } + } + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, MetasByItemprop(c, values...)...) + } + return nodes +} + +// MetasByProperty focus on `<meta property="dateModified" content="2020/09/29 11:27" />` +func MetasByProperty(doc *html.Node, values ...string) []*html.Node { + var nodes []*html.Node + if doc == nil || values == nil { + return nil + } + if doc.Type == html.ElementNode { + if doc.Data == "meta" { + for _, a := range doc.Attr { + if a.Key == "property" { + for _, v := range values { if v == a.Val { nodes = append(nodes, doc) } @@ -290,7 +387,7 @@ func MetasByName(doc *html.Node, name ...string) []*html.Node { } } for c := doc.FirstChild; c != nil; c = c.NextSibling { - nodes = append(nodes, MetasByName(c, name...)...) + nodes = append(nodes, MetasByProperty(c, values...)...) 
} return nodes } diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index 81cb618..3366618 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -12,8 +12,9 @@ import ( "golang.org/x/net/html" ) -func TestElementsByTagAndClass(t *testing.T) { - u, err := url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") +var u, err = url.Parse("https://news.ltn.com.tw/news/world/breakingnews/3277899") + +func TestDivWithAttr(t *testing.T) { if err != nil { t.Errorf("url Parse err: %v", err) } @@ -21,14 +22,42 @@ func TestElementsByTagAndClass(t *testing.T) { if err != nil { t.Errorf("GetRawAndDoc err: %v", err) } - tc := ElementsByTagAndClass(doc, "div", "wsw") + tc := DivWithAttr(doc, "data-desc", "內容頁") plist := ElementsByTag(tc[0], "p") for _, v := range plist { fmt.Println(v.FirstChild.Data) } } +func TestDivWithAttr2(t *testing.T) { + if err != nil { + t.Errorf("url Parse err: %v", err) + } + raw, _, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := DivWithAttr2(raw, "data-desc", "內容頁") + fmt.Println(string(tc)) +} + +func TestElementsByTagAndClass(t *testing.T) { + s, err := ioutil.ReadFile("./test.html") + if err != nil { + t.Errorf("read file err: %v", err) + } + doc, err := html.Parse(bytes.NewReader(s)) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := ElementsByTagAndClass(doc, "div", "paragraph") + a := ElementsByTag(tc[0], "h2", "p") + for _, v := range a { + if v.FirstChild != nil { + fmt.Println(v.FirstChild.Data) + } + } +} func TestElementsByTagAndClass2(t *testing.T) { - u, err := url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -41,7 +70,6 @@ func TestElementsByTagAndClass2(t *testing.T) { } func TestElementsByTagAndId(t *testing.T) { - u, err := 
url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -68,8 +96,56 @@ func TestElementsByTagAndId(t *testing.T) { } } +func TestMetaByProperty(t *testing.T) { + if err != nil { + t.Errorf("url Parse err: %v", err) + } + _, doc, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := MetasByProperty(doc, "article:modified_time") + rt := []string{} + for _, n := range tc { + for _, a := range n.Attr { + if a.Key == "content" { + rt = append(rt, a.Val) + } + } + } + want := "2020-08-25T09:42:32+08:00" + if want != rt[0] { + t.Errorf("want: %v, got: %v", want, rt[0]) + } + fmt.Println(rt[0]) +} + +func TestMetaByItemprop(t *testing.T) { + u, err = url.Parse("https://www.cna.com.tw/news/aopl/202009290075.aspx") + if err != nil { + t.Errorf("url Parse err: %v", err) + } + _, doc, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := MetasByItemprop(doc, "dateModified") + rt := []string{} + for _, n := range tc { + for _, a := range n.Attr { + if a.Key == "content" { + rt = append(rt, a.Val) + } + } + } + want := "2020/09/29 11:49" + if want != rt[0] { + t.Errorf("want: %v, got: %v", want, rt[0]) + } + fmt.Println(rt[0]) +} + func TestMetaByName(t *testing.T) { - u, err := url.Parse("https://www.dwnews.com/全球/60203304") if err != nil { t.Errorf("url Parse err: %v", err) } diff --git a/internal/htmldoc/test.html b/internal/htmldoc/test.html index 55804a0..f2e347c 100644 --- a/internal/htmldoc/test.html +++ b/internal/htmldoc/test.html @@ -15,5 +15,69 @@ <br> test text </p> + <div class="paragraph"> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="book-title"> + <div> + 今晨最新 + </div> + </div> + <div class="book" style="line-height: 1.8"> + <div class="book-content"> + <ul class="book-list"> + <li><a class="moreArticle-link" 
href="https://www.cna.com.tw/news/firstnews/202009290004.aspx">市長聯盟致歉:台灣六都會籍因技術問題誤分類</a></li> + <li><a class="moreArticle-link" href="https://www.cna.com.tw/news/firstnews/202009290002.aspx">解放軍4海域同時軍演 升高對台美施壓</a></li> + <li><a class="moreArticle-link" href="https://www.cna.com.tw/news/firstnews/202009290009.aspx">美股續漲 道瓊勁揚逾410點</a></li> + </ul> + </div> + </div></span> + </div> + </div> + <h2>安心旅遊補助續辦至10月底 中秋雙十連假可用</h2> + <p>交通部長林佳龍28日向行政院長蘇貞昌報告安心旅遊成效,蘇貞昌支持交通部觀光局安心旅遊補助續辦到10月底,預計中秋及雙十連假出遊民眾,仍可享用這項補助。 (<a href="https://www.cna.com.tw/news/firstnews/202009285009.aspx" class="早安世界延伸1">看完整報導</a>)</p> + <h2>8月景氣燈號轉綠燈 國發會:經濟漸趨回穩</h2> + <p>國發會報喜,28日宣布8月景氣燈號脫離黃藍燈、轉為代表穩定的綠燈,綜合判斷分數也較上月大增5分、升至26分,國發會認為,景氣從低點反轉向上的跡象愈來愈明顯,台灣經濟漸趨回穩。 (<a href="https://www.cna.com.tw/news/firstnews/202009285007.aspx" class="早安世界延伸2">看完整報導</a>)</p> + <h2>武漢肺炎死亡人數破百萬 中國豪賭疫苗效力</h2> + <p>武漢肺炎從中國爆發並蔓延全球迄今不到一年,迄今已逾百萬人喪命。世衛警告,若不採取更多集體行動,病故人數恐倍增。另一方面,中國正大範圍為特定人員接種<a href="https://www.cna.com.tw/news/firstnews/202009280182.aspx" class="早安世界延伸3">2019冠狀病毒疾病疫苗</a>,紐時報導,沒有其他國家在常規藥物試驗程序之外,以如此之大的規模給人們注射未經檢驗的疫苗。中國這種急切的做法相當於一場豪賭。(<a href="https://www.cna.com.tw/news/firstnews/202009280009.aspx" class="早安世界延伸4">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>亞美尼亞、亞塞拜然交戰 30年夙怨重新點燃</h2> + <p>高加索地區的亞塞拜然與亞美尼亞28日進入交戰第2日,已導致數十人喪生,雙方為這場戰鬥指責彼此,世界領袖已促請雙方冷靜,各界擔心這場戰鬥引發全面衝突,可能捲入區域大國俄羅斯和土耳其。「雙亞」夙怨糾結30年,因主張脫離亞塞拜然的納哥諾卡拉巴克地區迭起紛爭。 (<a href="https://www.cna.com.tw/news/firstnews/202009280159.aspx" class="早安世界延伸5">看完整報導</a>)</p> + <h2>TikTok暫逃封殺命運 聯邦法院擋下川普政府禁令</h2> + <p>川普政府原訂28日起禁止美國用戶下載熱門短影音分享應用程式TikTok,聯邦法官27日在最後一刻做出裁決,暫緩實施這項具有政治色彩的禁令,讓TikTok暫時逃過一劫。 (<a href="https://www.cna.com.tw/news/firstnews/202009280016.aspx" class="早安世界延伸6">看完整報導</a>)</p> + <h2>陽光普照代表台灣 角逐奧斯卡最佳國際影片</h2> + <p>導演鍾孟宏執導的電影「陽光普照」,將代表台灣角逐美國第93屆奧斯卡最佳國際影片獎。文化部28日表示,2020年共有18部國片報名,經過甄選後推薦由「陽光普照」代表台灣參賽,甄選委員認為本片「親子議題刻劃深刻,觸動人心,製作品質領先群倫,備受影展肯定,國際能見度佳。」(<a 
style="background-color: #ffffff;" href="https://www.cna.com.tw/news/firstnews/202009280216.aspx" class="早安世界延伸7">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>菲律賓籍移工檢疫期滿確診武漢肺炎 曾聚餐趴趴走一天半</h2> + <p>一名菲律賓移工檢疫期滿採檢,確診武漢肺炎。但在採檢結果出爐前,個案已進到社區活動約1.5天時間。疫情指揮中心28日表示,現已改為檢疫期滿前採檢,未來不會發生類似狀況。 (<a href="https://www.cna.com.tw/news/ahel/202009280170.aspx" class="早安世界延伸8">看完整報導</a>)</p> + <h2>台藝人中國國慶唱我的祖國若確認違法 最重罰50萬</h2> + <p>藝人歐陽娜娜、張韶涵傳將在中國國慶晚會獻唱「我的祖國」等歌曲,文化部28日表示,如果大陸委員會確認違法,最重可處新台幣50萬元罰鍰。此外,阿美族人楊品驊在海峽論壇上<a href="https://www.cna.com.tw/news/aipl/202009280241.aspx" class="早安世界延伸9">自稱是中國人</a>,原民會主委夷將.拔路兒28日表示,不反對與中國大陸互動,但必須認清自己是中華民國國民,不是中國人;不能忍受用個人的名義代表整個族群。 (<a href="https://www.cna.com.tw/news/firstnews/202009280099.aspx" class="早安世界延伸10">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>川普超會抵稅 當選那年所得稅僅繳750美元</h2> + <p>美國總統大選首場電視辯論即將登場之際,川普的財務紀錄又掀爭議。「紐約時報」27日爆料,川普在贏得大選的2016年,只繳了750美元(約新台幣2萬2000元)聯邦所得稅。川普對紐時的報導不屑一顧,他說自己「付了很多錢,也繳了很多的州所得稅」。 (<a href="https://www.cna.com.tw/news/firstnews/202009280129.aspx" class="早安世界延伸11">看完整報導</a>)</p> + <h2>中國審查制度下 編輯嘆出版太難</h2> + <p>法國經濟學家皮凱提新書無法在中國出版,只是反映出中國審查制度的冰山一角。本地編輯表示,現在有多重出版禁忌,且有時不知道紅線劃在哪裡,出好書越來越難,因為審核制度越來越嚴格,大家選書趨於保守。(<a href="https://www.cna.com.tw/news/acn/202009280323.aspx" class="早安世界延伸12">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"><p style="background-color: #F6F6F6; padding:30px; text-align:left">上午8點同步發送電子報!快來<a href="https://cna.us19.list-manage.com/subscribe?u=1c186687e418733737656ad4c&id=3a8cd69d5d">訂閱「早安世界」</a>給你最精華的新聞摘要。<br />若有任何建議請來信告訴我們,想獲得更多最新資訊快來<a href="https://www.facebook.com/cnanewstaiwan/?epa=SEARCH_BOX">和中央社做朋友</a>。<br />歡迎訂閱<a 
href="https://cna.us19.list-manage.com/subscribe?u=1c186687e418733737656ad4c&id=ecac484a79">中央社國際新聞電子報</a>,每週三、日發報,掌握世界脈動。</p></span> + </div> + </div> + </div> </body> </html> diff --git a/main.go b/main.go index 0fc8bb4..aa81256 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,13 @@ import ( func main() { year := strconv.Itoa(time.Now().Year()) sites := []string{ + // expand fetch range need update cna.go function: setTitle + "https://www.cna.com.tw/list/aopl.aspx", // 国际 + "https://news.ltn.com.tw/list/breakingnews/world", // 国际 + "https://www.zaobao.com/realtime/world", + "https://www.zaobao.com/news/world", + "https://www.zaobao.com/realtime/china", + "https://www.zaobao.com/news/china", "https://www.dwnews.com", "https://www.dwnews.com/issue/10062", "https://www.dwnews.com/zone/10000117",