From 7d56503c7d21c7de47b4ce5c875860ac81ae8143 Mon Sep 17 00:00:00 2001 From: wedojava Date: Thu, 23 Jul 2020 18:07:11 +0800 Subject: [PATCH 01/32] dev pause --- internal/fetcher/sites/ftchinese/ftchinese.go | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 internal/fetcher/sites/ftchinese/ftchinese.go diff --git a/internal/fetcher/sites/ftchinese/ftchinese.go b/internal/fetcher/sites/ftchinese/ftchinese.go new file mode 100644 index 0000000..c24279c --- /dev/null +++ b/internal/fetcher/sites/ftchinese/ftchinese.go @@ -0,0 +1,137 @@ +package ftchinese + +import ( + "errors" + "fmt" + "net/url" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := SetDate(p); err != nil { + return err + } + if err := SetTitle(p); err != nil { + return err + } + if err := SetBody(p); err != nil { + return err + } + return nil +} + +func SetDate(p *Post) error { + metas := htmldoc.MetasByName(p.DOC, "parsely-pub-date") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("dwnews SetData got nothing.") + } + p.Date = cs[0] + return nil +} + +func SetTitle(p *Post) error { + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element ") + } + title := n[0].FirstChild.Data + if strings.Contains(title, "[图集]") { + return fmt.Errorf("[!] Picture news ignored.") + } + title = strings.ReplaceAll(title, "|多维新闻", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func SetBody(p *Post) error { + if p.DOC == nil { + return errors.New("[-] there is no DOC object to get and format.") + } + b, err := Dwnews(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func Dwnews(p *Post) (string, error) { + doc := p.DOC + body := "" + // Fetch content nodes + nodes := htmldoc.ElementsByTag(doc, "article") + if len(nodes) == 0 { + return "", errors.New("[-] There is no tag named `<article>` from: " + p.URL.String()) + } + articleDoc := nodes[0].FirstChild + plist := htmldoc.ElementsByTag(articleDoc, "p") + if articleDoc.FirstChild.Data == "div" { // to fetch the summary block + // body += fmt.Sprintf("\n > %s \n", plist[0].FirstChild.Data) // redundant summary + body += fmt.Sprintf("\n > ") + } + for _, v := range plist { // the last item is `推荐阅读:` + if v.FirstChild == nil { + continue + } else if v.FirstChild.FirstChild != nil && v.FirstChild.Data == "strong" { + if d := v.FirstChild.FirstChild.Data; !strings.Contains(d, "↓↓↓") || + !strings.Contains(d, "点击浏览") { + body += fmt.Sprintf("\n** %s ** \n", d) + } + if t := v.FirstChild.NextSibling; t != nil && t.Type == html.TextNode { + body += t.Data + } + } else { + ok := true + + for _, a := range v.Parent.Attr { + if a.Key == "class" { + switch a.Val { + // if it is a info for picture, igonre! + case "sc-bdVaJa iHZvIS": + ok = false + // if it is a twitter content, ignore! + case "twitter-tweet": + ok = false + } + } + } + if ok { + body += v.FirstChild.Data + " \n" + } + } + } + body = strings.ReplaceAll(body, "strong", "") + body = strings.ReplaceAll(body, "** 推荐阅读: **", "") + return body, nil +} From f1b844e8248360533ac5241e26c5cd64e9ddbf19 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Thu, 23 Jul 2020 23:39:57 +0800 Subject: [PATCH 02/32] add defer+recover to treat panic error --- internal/fetcher/fetcher.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/internal/fetcher/fetcher.go b/internal/fetcher/fetcher.go index 1d0074a..3e19f91 100644 --- a/internal/fetcher/fetcher.go +++ b/internal/fetcher/fetcher.go @@ -38,6 +38,13 @@ func BreadthFirst(f func(item string), worklist []string) { } func Crawl(_url string) { + defer func() { + if err := recover(); err != nil { + e := err.(error) + log.Println(e) + PanicLog(e) + } + }() f := FetcherFactory(_url) log.Printf("[*] Deal with: [%s]\n", _url) log.Println("[*] Fetch links ...") @@ -102,6 +109,19 @@ func DelRoutine(folder string, n int) error { return nil } +func PanicLog(_err error) error { + filePath := "./PanicLog.txt" + file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + return err + } + defer file.Close() + write := bufio.NewWriter(file) + write.WriteString("[" + time.Now().Format(time.RFC3339) + "]--------------------------------------\n") + write.WriteString(_err.Error() + "\n") + write.Flush() + return nil +} func ErrLog(msg string) error { filePath := "./errLog.txt" file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) From a292ed81d509b7de9b151e722a8dde51a585efd3 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 25 Aug 2020 19:21:26 +0800 Subject: [PATCH 03/32] fetch links for zaobao.com --- internal/fetcher/links.go | 2 ++ internal/fetcher/links_test.go | 6 ++---- main.go | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 11b2106..92e1a26 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -34,6 +34,8 @@ func (f *Fetcher) SetLinks() error { case "www.rfa.org": f.Links = LinksFilter(links, `.*?/.*?-\d*.html`) KickOutLinksMatchPath(&f.Links, "about") + case "www.zaobao.com": + f.Links = LinksFilter(links, `.*?/world/.*`) } return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 74634cd..73b0417 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -18,17 +18,15 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://www.voachinese.com") + u, err := url.Parse("https://www.zaobao.com/realtime/world") if err != nil { t.Errorf("Url Parse fail!\n%s", err) } var f = &Fetcher{ Entrance: u, - // Entrance: "https://www.voachinese.com", } f.SetLinks() - // assertLink := "https://www.voachinese.com/a/who-remains-tight-lipped-experts-sent-investigate-coronavirus-china-20200713/5500866.html" - assertLink := "https://www.voachinese.com/a/fire-still-raging-aboard-navy-ship-docked-in-california-20200713/5500960.html" + assertLink := "https://www.zaobao.com/realtime/world/story20200825-1079575" shot := 0 for _, link := range f.Links { fmt.Println(link) diff --git a/main.go b/main.go index 0fc8bb4..65e555e 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,8 @@ import ( func main() { year := strconv.Itoa(time.Now().Year()) sites := []string{ + "https://www.zaobao.com/realtime/world", + "https://www.zaobao.com/news/world", "https://www.dwnews.com", "https://www.dwnews.com/issue/10062", "https://www.dwnews.com/zone/10000117", From 7ec36d477b3ba3fe02e206794c1a12f9ddf9ec9b Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 25 Aug 2020 20:49:32 +0800 Subject: [PATCH 04/32] add zaobao, title body get done. --- internal/fetcher/sites/zaobao/zaobao.go | 115 +++++++++++++++++++ internal/fetcher/sites/zaobao/zaobao_test.go | 63 ++++++++++ 2 files changed, 178 insertions(+) create mode 100644 internal/fetcher/sites/zaobao/zaobao.go create mode 100644 internal/fetcher/sites/zaobao/zaobao_test.go diff --git a/internal/fetcher/sites/zaobao/zaobao.go b/internal/fetcher/sites/zaobao/zaobao.go new file mode 100644 index 0000000..6ac59e4 --- /dev/null +++ b/internal/fetcher/sites/zaobao/zaobao.go @@ -0,0 +1,115 @@ +package dwnews + +import ( + "errors" + "fmt" + "net/url" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := SetDate(p); err != nil { + return err + } + if err := SetTitle(p); err != nil { + return err + } + if err := SetBody(p); err != nil { + return err + } + return nil +} + +func SetDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByName(p.DOC, "parsely-pub-date") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("dwnews SetData got nothing.") + } + p.Date = cs[0] + return nil +} + +func SetTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element <title>") + } + title := n[0].FirstChild.Data + title = strings.ReplaceAll(title, " | 联合早报网", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func SetBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := Zaobao(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func Zaobao(p *Post) (string, error) { + if p.DOC == nil { + return "", fmt.Errorf("[-] p.DOC is nil") + } + doc := p.DOC + body := "" + // Fetch content nodes + nodes := htmldoc.ElementsByTagAndClass(doc, "div", "article-content-container") + if len(nodes) == 0 { + nodes = htmldoc.ElementsByTagAndClass(doc, "div", "article-content-rawhtml") + } + if len(nodes) == 0 { + return "", errors.New("[-] There is no tag named `<article>` from: " + p.URL.String()) + } + plist := htmldoc.ElementsByTag(nodes[0], "p") + for _, v := range plist { // the last item is `推荐阅读:` + if v.FirstChild == nil { + continue + } else { + body += v.FirstChild.Data + " \n" + } + } + return body, nil +} diff --git a/internal/fetcher/sites/zaobao/zaobao_test.go b/internal/fetcher/sites/zaobao/zaobao_test.go new file mode 100644 index 0000000..0d6c11c --- /dev/null +++ b/internal/fetcher/sites/zaobao/zaobao_test.go @@ -0,0 +1,63 @@ +package dwnews + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetTitle(t *testing.T) { + p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetTitle(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "韩首都圈学校全面线上上课两周" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestZaobao(t *testing.T) { + p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := Zaobao(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} From c140227b4701a988c4d4a3f0a92e949fa0ed41e4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 25 Aug 2020 20:50:27 +0800 Subject: [PATCH 05/32] fix panic --- internal/fetcher/sites/dwnews/dwnews.go | 3 +- internal/fetcher/sites/ftchinese/ftchinese.go | 137 ------------------ 2 files changed, 2 insertions(+), 138 deletions(-) delete mode 100644 internal/fetcher/sites/ftchinese/ftchinese.go diff --git a/internal/fetcher/sites/dwnews/dwnews.go b/internal/fetcher/sites/dwnews/dwnews.go index 4157594..c33e941 100644 --- a/internal/fetcher/sites/dwnews/dwnews.go +++ b/internal/fetcher/sites/dwnews/dwnews.go @@ -105,7 +105,8 @@ func Dwnews(p *Post) (string, error) { } articleDoc := nodes[0].FirstChild plist := htmldoc.ElementsByTag(articleDoc, "p") - if articleDoc.FirstChild.Data == "div" { // to fetch the summary block + if articleDoc.FirstChild != nil && + articleDoc.FirstChild.Data == "div" { // to fetch the summary block // body += fmt.Sprintf("\n > %s \n", plist[0].FirstChild.Data) // redundant summary body += fmt.Sprintf("\n > ") } diff --git a/internal/fetcher/sites/ftchinese/ftchinese.go b/internal/fetcher/sites/ftchinese/ftchinese.go deleted file mode 100644 index c24279c..0000000 --- a/internal/fetcher/sites/ftchinese/ftchinese.go +++ /dev/null @@ -1,137 +0,0 @@ -package ftchinese - -import ( - "errors" - "fmt" - "net/url" - "strings" - "time" - - "github.com/wedojava/fetcher/internal/htmldoc" - "github.com/wedojava/gears" - "golang.org/x/net/html" -) - -type Post struct { - Domain string - URL *url.URL - DOC *html.Node - Raw []byte - Title string - Body string - Date string - Filename string -} - -func SetPost(p *Post) error { - if err := SetDate(p); err != nil { - return err - } - if err := SetTitle(p); err != nil { - return err - } - if err := SetBody(p); err != nil { - return err - } - return nil -} - -func SetDate(p *Post) error { - metas := htmldoc.MetasByName(p.DOC, "parsely-pub-date") - cs := []string{} - for _, meta := range metas { - for _, a := range meta.Attr { - if a.Key == "content" { - cs = append(cs, a.Val) - } - } - } - if len(cs) <= 0 { - return fmt.Errorf("dwnews SetData got nothing.") - } - p.Date = cs[0] - return nil -} - -func SetTitle(p *Post) error { - n := htmldoc.ElementsByTag(p.DOC, "title") - if n == nil { - return fmt.Errorf("[-] there is no element <title>") - } - title := n[0].FirstChild.Data - if strings.Contains(title, "[图集]") { - return fmt.Errorf("[!] Picture news ignored.") - } - title = strings.ReplaceAll(title, "|多维新闻", "") - title = strings.TrimSpace(title) - gears.ReplaceIllegalChar(&title) - p.Title = title - return nil -} - -func SetBody(p *Post) error { - if p.DOC == nil { - return errors.New("[-] there is no DOC object to get and format.") - } - b, err := Dwnews(p) - if err != nil { - return err - } - t, err := time.Parse(time.RFC3339, p.Date) - if err != nil { - return err - } - h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) - p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() - return nil -} - -func Dwnews(p *Post) (string, error) { - doc := p.DOC - body := "" - // Fetch content nodes - nodes := htmldoc.ElementsByTag(doc, "article") - if len(nodes) == 0 { - return "", errors.New("[-] There is no tag named `<article>` from: " + p.URL.String()) - } - articleDoc := nodes[0].FirstChild - plist := htmldoc.ElementsByTag(articleDoc, "p") - if articleDoc.FirstChild.Data == "div" { // to fetch the summary block - // body += fmt.Sprintf("\n > %s \n", plist[0].FirstChild.Data) // redundant summary - body += fmt.Sprintf("\n > ") - } - for _, v := range plist { // the last item is `推荐阅读:` - if v.FirstChild == nil { - continue - } else if v.FirstChild.FirstChild != nil && v.FirstChild.Data == "strong" { - if d := v.FirstChild.FirstChild.Data; !strings.Contains(d, "↓↓↓") || - !strings.Contains(d, "点击浏览") { - body += fmt.Sprintf("\n** %s ** \n", d) - } - if t := v.FirstChild.NextSibling; t != nil && t.Type == html.TextNode { - body += t.Data - } - } else { - ok := true - - for _, a := range v.Parent.Attr { - if a.Key == "class" { - switch a.Val { - // if it is a info for picture, igonre! - case "sc-bdVaJa iHZvIS": - ok = false - // if it is a twitter content, ignore! - case "twitter-tweet": - ok = false - } - } - } - if ok { - body += v.FirstChild.Data + " \n" - } - } - } - body = strings.ReplaceAll(body, "strong", "") - body = strings.ReplaceAll(body, "** 推荐阅读: **", "") - return body, nil -} From 870881c3bb2c6a7366c6776ba62fe49e3d891cf3 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 25 Aug 2020 20:51:01 +0800 Subject: [PATCH 06/32] fix wrong test case. --- internal/htmldoc/htmldoc_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index 81cb618..2c1dc76 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -13,7 +13,7 @@ import ( ) func TestElementsByTagAndClass(t *testing.T) { - u, err := url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") + u, err := url.Parse("https://www.zaobao.com/realtime/world/story20200825-1079575") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -21,7 +21,7 @@ func TestElementsByTagAndClass(t *testing.T) { if err != nil { t.Errorf("GetRawAndDoc err: %v", err) } - tc := ElementsByTagAndClass(doc, "div", "wsw") + tc := ElementsByTagAndClass(doc, "div", "article-content-container") plist := ElementsByTag(tc[0], "p") for _, v := range plist { fmt.Println(v.FirstChild.Data) From 7b6ee34997a43a9ad7ae10da59f3296cd839d2a3 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 01:24:45 +0800 Subject: [PATCH 07/32] add func --- internal/htmldoc/htmldoc.go | 24 ++++++++++++++++++++++++ internal/htmldoc/htmldoc_test.go | 25 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/internal/htmldoc/htmldoc.go b/internal/htmldoc/htmldoc.go index 0472c89..422cab2 100644 --- a/internal/htmldoc/htmldoc.go +++ b/internal/htmldoc/htmldoc.go @@ -295,6 +295,30 @@ func MetasByName(doc *html.Node, name ...string) []*html.Node { return nodes } +func MetasByProperty(doc *html.Node, name ...string) []*html.Node { + var nodes []*html.Node + if doc == nil || name == nil { + return nil + } + if doc.Type == html.ElementNode { + if doc.Data == "meta" { + for _, a := range doc.Attr { + if a.Key == "property" { + for _, v := range name { + if v == a.Val { + nodes = append(nodes, doc) + } + } + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, MetasByProperty(c, name...)...) + } + return nodes +} + func ForEachNode(n *html.Node, pre, post func(n *html.Node)) { if pre != nil { pre(n) diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index 2c1dc76..596f755 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -68,6 +68,31 @@ func TestElementsByTagAndId(t *testing.T) { } } +func TestMetaByProperty(t *testing.T) { + u, err := url.Parse("https://www.zaobao.com/realtime/world/story20200825-1079575") + if err != nil { + t.Errorf("url Parse err: %v", err) + } + _, doc, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := MetasByProperty(doc, "article:modified_time") + rt := []string{} + for _, n := range tc { + for _, a := range n.Attr { + if a.Key == "content" { + rt = append(rt, a.Val) + } + } + } + want := "2020-08-25T09:42:32+08:00" + if want != rt[0] { + t.Errorf("want: %v, got: %v", want, rt[0]) + } + fmt.Println(rt[0]) +} + func TestMetaByName(t *testing.T) { u, err := url.Parse("https://www.dwnews.com/全球/60203304") if err != nil { From 237e8da111f1194c0523dbb285f0cb616230ef12 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 01:25:34 +0800 Subject: [PATCH 08/32] add test case url --- internal/fetcher/fetcher_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index bc4fbc0..6655974 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -10,7 +10,9 @@ func TestCrawl(t *testing.T) { for { BreadthFirst(Crawl, []string{ // "https://www.boxun.com/rolling.shtml", - "https://www.dwnews.com", + // "https://www.dwnews.com", + "https://www.zaobao.com/realtime/world", + "https://www.zaobao.com/news/world", // "https://www.voachinese.com", // "https://www.rfa.org/mandarin/", }) From 7440b3db90ce0e961ce8dfb4decdecbca7e14583 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 01:30:13 +0800 Subject: [PATCH 09/32] Errlog no use now. --- internal/fetcher/fetcher.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/internal/fetcher/fetcher.go b/internal/fetcher/fetcher.go index 3e19f91..020180a 100644 --- a/internal/fetcher/fetcher.go +++ b/internal/fetcher/fetcher.go @@ -122,16 +122,17 @@ func PanicLog(_err error) error { write.Flush() return nil } + func ErrLog(msg string) error { - filePath := "./errLog.txt" - file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) - if err != nil { - return err - } - defer file.Close() - write := bufio.NewWriter(file) - write.WriteString("[" + time.Now().Format(time.RFC3339) + "]--------------------------------------\n") - write.WriteString(msg + "\n") - write.Flush() + // filePath := "./errLog.txt" + // file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) + // if err != nil { + // return err + // } + // defer file.Close() + // write := bufio.NewWriter(file) + // write.WriteString("[" + time.Now().Format(time.RFC3339) + "]--------------------------------------\n") + // write.WriteString(msg + "\n") + // write.Flush() return nil } From b522317b734dd8b68b4ee7e7cfa8b8c8ec889d19 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 01:35:12 +0800 Subject: [PATCH 10/32] add zaobao --- internal/fetcher/post.go | 7 +++++++ internal/fetcher/post_test.go | 17 ++++++++++------- internal/fetcher/sites/zaobao/zaobao.go | 4 ++-- internal/fetcher/sites/zaobao/zaobao_test.go | 18 +++++++++++++++++- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index 6b8b82f..d427744 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -13,6 +13,7 @@ import ( "github.com/wedojava/fetcher/internal/fetcher/sites/dwnews" "github.com/wedojava/fetcher/internal/fetcher/sites/rfa" "github.com/wedojava/fetcher/internal/fetcher/sites/voachinese" + "github.com/wedojava/fetcher/internal/fetcher/sites/zaobao" "github.com/wedojava/fetcher/internal/htmldoc" "github.com/wedojava/gears" "golang.org/x/net/html" @@ -87,6 +88,12 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + case "www.zaobao.com": + post := zaobao.Post(*p) + if err := zaobao.SetPost(&post); err != nil { + return err + } + *p = Post(post) } // Save post to file if err := p.SetFilename(); err != nil { diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index 01308c2..4ad22b8 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -10,7 +10,8 @@ import ( func TestSetAndSavePost(t *testing.T) { // p := PostFactory("https://www.dwnews.com/经济/60203253") - p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + // p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one + p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoC error: %v", err) @@ -25,12 +26,14 @@ func TestSetAndSavePost(t *testing.T) { func TestTreatPost(t *testing.T) { tcs := []string{ - "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", - "https://www.dwnews.com/经济/60203253", - "https://www.dwnews.com/全球/60203234", - "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", - "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", - "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + // "https://www.boxun.com/news/gb/taiwan/2020/07/202007091815.shtml", + // "https://www.dwnews.com/经济/60203253", + // "https://www.dwnews.com/全球/60203234", + // "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", + // "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", + // "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", + "https://www.zaobao.com/realtime/world/story20200825-1079575", + "https://www.zaobao.com/news/world/story20200825-1079477", } for _, tc := range tcs { p := PostFactory(tc) diff --git a/internal/fetcher/sites/zaobao/zaobao.go b/internal/fetcher/sites/zaobao/zaobao.go index 6ac59e4..ea7fe9b 100644 --- a/internal/fetcher/sites/zaobao/zaobao.go +++ b/internal/fetcher/sites/zaobao/zaobao.go @@ -1,4 +1,4 @@ -package dwnews +package zaobao import ( "errors" @@ -40,7 +40,7 @@ func SetDate(p *Post) error { if p.DOC == nil { return fmt.Errorf("[-] p.DOC is nil") } - metas := htmldoc.MetasByName(p.DOC, "parsely-pub-date") + metas := htmldoc.MetasByProperty(p.DOC, "article:modified_time") cs := []string{} for _, meta := range metas { for _, a := range meta.Attr { diff --git a/internal/fetcher/sites/zaobao/zaobao_test.go b/internal/fetcher/sites/zaobao/zaobao_test.go index 0d6c11c..46bafba 100644 --- a/internal/fetcher/sites/zaobao/zaobao_test.go +++ b/internal/fetcher/sites/zaobao/zaobao_test.go @@ -1,4 +1,4 @@ -package dwnews +package zaobao import ( "fmt" @@ -21,6 +21,22 @@ func PostFactory(rawurl string) *Post { } } +func TestSetDate(t *testing.T) { + p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-08-25T09:42:32+08:00" + if p.Date != want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + func TestSetTitle(t *testing.T) { p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) From eb9e16119eb1f4c76f30c2c85a851542f425f281 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 01:52:56 +0800 Subject: [PATCH 11/32] add --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..46c5934 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# Fetcher + +Fetch news from urls. + +# Intro + +Fetch depth: 1 + +# Development Tips + +There files need to be modified while add a new site: +- main.go: add entrance url +- links.go -> SetLinks(): add case about target urls feature regex, eg: if url must have `about`, param 2 is `.*?about.*` +- post.go -> TreatPost(): add case for new site domain. +- site/newsite: copy files from sibling folder, then develop and pass the test. From bf1f5580632a6be1f8b9591f63ea7f8016190810 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 02:05:41 +0800 Subject: [PATCH 12/32] add entrances --- main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.go b/main.go index 65e555e..0d7b1f3 100644 --- a/main.go +++ b/main.go @@ -13,6 +13,8 @@ func main() { sites := []string{ "https://www.zaobao.com/realtime/world", "https://www.zaobao.com/news/world", + "https://www.zaobao.com/realtime/china", + "https://www.zaobao.com/news/china", "https://www.dwnews.com", "https://www.dwnews.com/issue/10062", "https://www.dwnews.com/zone/10000117", From 89c2441baa717f9d878e31fdb44d1e1df23ba78e Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 26 Aug 2020 02:41:29 +0800 Subject: [PATCH 13/32] extend links range --- internal/fetcher/links.go | 6 +++++- internal/fetcher/links_test.go | 19 ++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 92e1a26..2a1a5ec 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -35,7 +35,11 @@ func (f *Fetcher) SetLinks() error { f.Links = LinksFilter(links, `.*?/.*?-\d*.html`) KickOutLinksMatchPath(&f.Links, "about") case "www.zaobao.com": - f.Links = LinksFilter(links, `.*?/world/.*`) + newsWorld := LinksFilter(links, `.*?/news/world/.*`) + newsChina := LinksFilter(links, `.*?/news/china/.*`) + realtimeWorld := LinksFilter(links, `.*?/realtime/world/.*`) + realtimeChina := LinksFilter(links, `.*?/realtime/china/.*`) + f.Links = append(append(append(newsWorld, newsChina...), realtimeWorld...), realtimeChina...) } return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 73b0417..45128a7 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -1,7 +1,6 @@ package fetcher import ( - "fmt" "net/url" "testing" ) @@ -18,7 +17,7 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://www.zaobao.com/realtime/world") + u, err := url.Parse("https://www.zaobao.com/realtime/") if err != nil { t.Errorf("Url Parse fail!\n%s", err) } @@ -26,15 +25,21 @@ func TestSetLinks(t *testing.T) { Entrance: u, } f.SetLinks() - assertLink := "https://www.zaobao.com/realtime/world/story20200825-1079575" + assertLinks := []string{ + "https://www.zaobao.com/realtime/china/story20200825-1079597", + "https://www.zaobao.com/realtime/world/story20200825-1079575", + "https://www.zaobao.com/news/china/story20200825-1079462", + "https://www.zaobao.com/news/world/story20200825-1079476", + } shot := 0 for _, link := range f.Links { - fmt.Println(link) - if link == assertLink { - shot++ + for _, v := range assertLinks { + if link == v { + shot++ + } } } if shot == 0 { - t.Errorf("want: %v, got: %v", 1, shot) + t.Errorf("want: %v, got: %v", len(assertLinks), shot) } } From 06800b53cf56b0389f8b5a7a0f77cddd619c0c00 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 2 Sep 2020 03:20:13 +0800 Subject: [PATCH 14/32] add ltn --- internal/fetcher/fetcher_test.go | 5 +- internal/fetcher/links.go | 2 + internal/fetcher/links_test.go | 10 +- internal/fetcher/post.go | 15 ++- internal/fetcher/post_test.go | 12 ++- internal/fetcher/sites/ltn/ltn.go | 117 ++++++++++++++++++++++++ internal/fetcher/sites/ltn/ltn_test.go | 77 ++++++++++++++++ internal/fetcher/sites/zaobao/zaobao.go | 1 + internal/htmldoc/htmldoc.go | 46 ++++++++++ internal/htmldoc/htmldoc_test.go | 33 ++++++- main.go | 1 + 11 files changed, 300 insertions(+), 19 deletions(-) create mode 100644 internal/fetcher/sites/ltn/ltn.go create mode 100644 internal/fetcher/sites/ltn/ltn_test.go diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index 6655974..1f18ce5 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -11,10 +11,11 @@ func TestCrawl(t *testing.T) { BreadthFirst(Crawl, []string{ // "https://www.boxun.com/rolling.shtml", // "https://www.dwnews.com", - "https://www.zaobao.com/realtime/world", - "https://www.zaobao.com/news/world", + // "https://www.zaobao.com/realtime/world", + // "https://www.zaobao.com/news/world", // "https://www.voachinese.com", // "https://www.rfa.org/mandarin/", + "https://news.ltn.com.tw/list/breakingnews", }) log.Println("Sleep a sec ...") diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 2a1a5ec..7f392d4 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -40,6 +40,8 @@ func (f *Fetcher) SetLinks() error { realtimeWorld := LinksFilter(links, `.*?/realtime/world/.*`) realtimeChina := LinksFilter(links, `.*?/realtime/china/.*`) f.Links = append(append(append(newsWorld, newsChina...), realtimeWorld...), realtimeChina...) + case "news.ltn.com.tw": + f.Links = LinksFilter(links, `https://news.*/news/.*`) } return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 45128a7..3d18619 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -17,7 +17,7 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://www.zaobao.com/realtime/") + u, err := url.Parse("https://news.ltn.com.tw/list/breakingnews") if err != nil { t.Errorf("Url Parse fail!\n%s", err) } @@ -26,10 +26,10 @@ func TestSetLinks(t *testing.T) { } f.SetLinks() assertLinks := []string{ - "https://www.zaobao.com/realtime/china/story20200825-1079597", - "https://www.zaobao.com/realtime/world/story20200825-1079575", - "https://www.zaobao.com/news/china/story20200825-1079462", - "https://www.zaobao.com/news/world/story20200825-1079476", + "https://news.ltn.com.tw/news/society/breakingnews/3278253", + "https://news.ltn.com.tw/news/society/breakingnews/3278250", + "https://news.ltn.com.tw/news/politics/breakingnews/3278225", + "https://news.ltn.com.tw/news/politics/breakingnews/3278170", } shot := 0 for _, link := range f.Links { diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index d427744..3c978b0 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -11,6 +11,7 @@ import ( "github.com/wedojava/fetcher/internal/fetcher/sites/boxun" "github.com/wedojava/fetcher/internal/fetcher/sites/dwnews" + "github.com/wedojava/fetcher/internal/fetcher/sites/ltn" "github.com/wedojava/fetcher/internal/fetcher/sites/rfa" "github.com/wedojava/fetcher/internal/fetcher/sites/voachinese" "github.com/wedojava/fetcher/internal/fetcher/sites/zaobao" @@ -94,18 +95,24 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + case "news.ltn.com.tw": + post := ltn.Post(*p) + if err := ltn.SetPost(&post); err != nil { + return err + } + *p = Post(post) } // Save post to file - if err := p.SetFilename(); err != nil { + if err := p.setFilename(); err != nil { return err } - if err := p.SavePost(); err != nil { + if err := p.savePost(); err != nil { return err } return nil } -func (p *Post) SavePost() error { +func (p *Post) savePost() error { folderPath := filepath.Join("wwwroot", p.Domain) gears.MakeDirAll(folderPath) if p.Filename == "" { @@ -122,7 +129,7 @@ func (p *Post) SavePost() error { return nil } -func (p *Post) SetFilename() error { +func (p *Post) setFilename() error { t, err := time.Parse(time.RFC3339, p.Date) if err != nil { return err diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index 4ad22b8..c970add 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -2,6 +2,7 @@ package fetcher import ( "fmt" + "log" "testing" "time" @@ -32,11 +33,16 @@ func TestTreatPost(t *testing.T) { // "https://www.voachinese.com/a/S-Korea-Says-US-Sees-Importance-Of-N-Korea-Talks-Despite-Tension-20200709/5496028.html", // "https://www.rfa.org/mandarin/yataibaodao/shaoshuminzu/gf1-07092020074142.html", // "https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html", - "https://www.zaobao.com/realtime/world/story20200825-1079575", - "https://www.zaobao.com/news/world/story20200825-1079477", + // "https://www.zaobao.com/realtime/world/story20200825-1079575", + // "https://www.zaobao.com/news/world/story20200825-1079477", + // "https://www.zaobao.com.sg/realtime/world/story20200901-1081441", + "https://ec.ltn.com.tw/article/breakingnews/3277361", } for _, tc := range tcs { p := PostFactory(tc) - p.TreatPost() + err := p.TreatPost() + if err != nil { + log.Println(err) + } } } diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go new file mode 100644 index 0000000..90bee4c --- /dev/null +++ b/internal/fetcher/sites/ltn/ltn.go @@ -0,0 +1,117 @@ +package ltn + +import ( + "bytes" + "fmt" + "net/url" + "regexp" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := setDate(p); err != nil { + return err + } + if err := setTitle(p); err != nil { + return err + } + if err := setBody(p); err != nil { + return err + } + return nil +} + +func setDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByProperty(p.DOC, "article:published_time") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("SetData got nothing.") + } + p.Date = cs[0] + return nil +} + +func setTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element <title>") + } + title := n[0].FirstChild.Data + title = strings.ReplaceAll(title, " - 自由時報電子報", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func setBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := ltn(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func ltn(p *Post) (string, error) { + if p.Raw == nil { + return "", fmt.Errorf("[-] p.Raw is nil") + } + raw := p.Raw + // Fetch content nodes + r := htmldoc.DivWithAttr2(raw, "data-desc", "內容頁") + ps := [][]byte{} + b := bytes.Buffer{} + re := regexp.MustCompile(`<p>(.*?)</p>`) + for _, v := range re.FindAllSubmatch(r, -1) { + ps = append(ps, v[1]) + } + if len(ps) == 0 { + return "", fmt.Errorf("no <p> matched") + } + re = regexp.MustCompile(`<iframe.*?</iframe>`) + for _, p := range ps { + p = re.ReplaceAll(p, []byte("")) + b.Write(p) + b.Write([]byte(" \n")) + } + + return b.String(), nil +} diff --git a/internal/fetcher/sites/ltn/ltn_test.go b/internal/fetcher/sites/ltn/ltn_test.go new file mode 100644 index 0000000..d4f5e50 --- /dev/null +++ b/internal/fetcher/sites/ltn/ltn_test.go @@ -0,0 +1,77 @@ +package ltn + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +var p = PostFactory("https://news.ltn.com.tw/news/world/breakingnews/3277899") + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetDate(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-09-01T12:14:01+08:00" + if p.Date != want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + +func TestSetTitle(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setTitle(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "反送中12港青逃台被逮 林鄭月娥暗示應「送中」 - 國際" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestLtn(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := ltn(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} diff --git a/internal/fetcher/sites/zaobao/zaobao.go b/internal/fetcher/sites/zaobao/zaobao.go index ea7fe9b..fee31f8 100644 --- a/internal/fetcher/sites/zaobao/zaobao.go +++ b/internal/fetcher/sites/zaobao/zaobao.go @@ -66,6 +66,7 @@ func SetTitle(p *Post) error { } title := n[0].FirstChild.Data title = strings.ReplaceAll(title, " | 联合早报网", "") + title = strings.ReplaceAll(title, " | 早报", "") title = strings.TrimSpace(title) gears.ReplaceIllegalChar(&title) p.Title = title diff --git a/internal/htmldoc/htmldoc.go b/internal/htmldoc/htmldoc.go index 422cab2..b959859 100644 --- a/internal/htmldoc/htmldoc.go +++ b/internal/htmldoc/htmldoc.go @@ -84,6 +84,51 @@ func ExtractLinks(weburl string) ([]string, error) { return links, nil } +func DivWithAttr(doc *html.Node, attrName, attrValue string) []*html.Node { + var nodes []*html.Node + if attrName == "" || attrValue == "" || doc == nil { + return nil + } + if doc.Type == html.ElementNode { + if "div" == doc.Data { + for _, a := range doc.Attr { + if a.Key == attrName && a.Val == attrValue { + nodes = append(nodes, doc) + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, DivWithAttr(c, attrName, attrValue)...) + } + return nodes +} + +func DivWithAttr2(raw []byte, attrName, attrValue string) []byte { + if attrName == "" || attrValue == "" || raw == nil { + return nil + } + z := html.NewTokenizer(bytes.NewReader(raw)) + for { + tt := z.Next() + t := z.Token() + if err := z.Err(); err != nil && err == io.EOF { + break + } + switch tt { + case html.StartTagToken: + if "div" == t.Data { + for _, a := range t.Attr { + if a.Key == attrName && a.Val == attrValue { + return z.Buffered() + } + } + } + } + } + return nil +} + func ElementsNext(doc *html.Node) []*html.Node { nodes := []*html.Node{} if doc == nil { @@ -232,6 +277,7 @@ func ElementsByTagAndId2(raw []byte, tag, id string) []byte { } return nil } + func ElementsByTagAndType(doc *html.Node, tag, attrType string) []*html.Node { var nodes []*html.Node if tag == "" || attrType == "" || doc == nil { diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index 596f755..dc7372a 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -12,8 +12,35 @@ import ( "golang.org/x/net/html" ) +var u, err = url.Parse("https://news.ltn.com.tw/news/world/breakingnews/3277899") + +func TestDivWithAttr(t *testing.T) { + if err != nil { + t.Errorf("url Parse err: %v", err) + } + _, doc, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := DivWithAttr(doc, "data-desc", "內容頁") + plist := ElementsByTag(tc[0], "p") + for _, v := range plist { + fmt.Println(v.FirstChild.Data) + } +} +func TestDivWithAttr2(t *testing.T) { + if err != nil { + t.Errorf("url Parse err: %v", err) + } + raw, _, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := DivWithAttr2(raw, "data-desc", "內容頁") + fmt.Println(string(tc)) +} + func TestElementsByTagAndClass(t *testing.T) { - u, err := url.Parse("https://www.zaobao.com/realtime/world/story20200825-1079575") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -28,7 +55,6 @@ func TestElementsByTagAndClass(t *testing.T) { } } func TestElementsByTagAndClass2(t *testing.T) { - u, err := url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -41,7 +67,6 @@ func TestElementsByTagAndClass2(t *testing.T) { } func TestElementsByTagAndId(t *testing.T) { - u, err := url.Parse("https://www.rfa.org/mandarin/yataibaodao/junshiwaijiao/jt-07022020105416.html") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -69,7 +94,6 @@ func TestElementsByTagAndId(t *testing.T) { } func TestMetaByProperty(t *testing.T) { - u, err := url.Parse("https://www.zaobao.com/realtime/world/story20200825-1079575") if err != nil { t.Errorf("url Parse err: %v", err) } @@ -94,7 +118,6 @@ func TestMetaByProperty(t *testing.T) { } func TestMetaByName(t *testing.T) { - u, err := url.Parse("https://www.dwnews.com/全球/60203304") if err != nil { t.Errorf("url Parse err: %v", err) } diff --git a/main.go b/main.go index 0d7b1f3..547c2a4 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,7 @@ import ( func main() { year := strconv.Itoa(time.Now().Year()) sites := []string{ + "https://news.ltn.com.tw/list/breakingnews", "https://www.zaobao.com/realtime/world", "https://www.zaobao.com/news/world", "https://www.zaobao.com/realtime/china", From e2cda9a14c6ed44746841b49bf839781adad1a5f Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 2 Sep 2020 18:44:25 +0800 Subject: [PATCH 15/32] filter ltn content --- internal/fetcher/post.go | 6 ++++-- internal/fetcher/sites/ltn/ltn.go | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index 3c978b0..3693494 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -101,6 +101,8 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + default: + return fmt.Errorf("switch no case on: %s", p.Domain) } // Save post to file if err := p.setFilename(); err != nil { @@ -116,11 +118,11 @@ func (p *Post) savePost() error { folderPath := filepath.Join("wwwroot", p.Domain) gears.MakeDirAll(folderPath) if p.Filename == "" { - return errors.New("SavePost need a filename, but got none.") + return errors.New("savePost need a filename, but got none.") } filepath := filepath.Join(folderPath, p.Filename) if p.Body == "" { - p.Body = "[-] Fetch error on visit: " + p.URL.String() + p.Body = "savePost p.Body = \"\"" } err := ioutil.WriteFile(filepath, []byte(p.Body), 0644) if err != nil { diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go index 90bee4c..7453d66 100644 --- a/internal/fetcher/sites/ltn/ltn.go +++ b/internal/fetcher/sites/ltn/ltn.go @@ -106,12 +106,21 @@ func ltn(p *Post) (string, error) { if len(ps) == 0 { return "", fmt.Errorf("no <p> matched") } - re = regexp.MustCompile(`<iframe.*?</iframe>`) for _, p := range ps { - p = re.ReplaceAll(p, []byte("")) b.Write(p) b.Write([]byte(" \n")) } + body := b.String() + re = regexp.MustCompile(`「`) + body = re.ReplaceAllString(body, "“") + re = regexp.MustCompile(`」`) + body = re.ReplaceAllString(body, "”") + re = regexp.MustCompile(`<a.*?>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`</a>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<iframe.*?</iframe>`) + body = re.ReplaceAllString(body, "") - return b.String(), nil + return body, nil } From 95b0bdfe24e1f84b8b539c4bdcd780ab4ae098a4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 2 Sep 2020 19:12:20 +0800 Subject: [PATCH 16/32] rm files with same title saved before. --- internal/fetcher/post.go | 20 ++++++++++++++++++-- internal/fetcher/post_test.go | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index 3693494..cf3ba5e 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -6,7 +6,9 @@ import ( "io/ioutil" "log" "net/url" + "os" "path/filepath" + "strings" "time" "github.com/wedojava/fetcher/internal/fetcher/sites/boxun" @@ -120,11 +122,25 @@ func (p *Post) savePost() error { if p.Filename == "" { return errors.New("savePost need a filename, but got none.") } - filepath := filepath.Join(folderPath, p.Filename) + fpath := filepath.Join(folderPath, p.Filename) + // !+ rm files with same title + files, err := ioutil.ReadDir(folderPath) + if err != nil { + return err + } + for _, f := range files { + if !f.IsDir() && strings.Contains(f.Name(), p.Title) { + err = os.Remove(filepath.Join(folderPath, f.Name())) + if err != nil { + return err + } + } + } + // !- rm files with same title if p.Body == "" { p.Body = "savePost p.Body = \"\"" } - err := ioutil.WriteFile(filepath, []byte(p.Body), 0644) + err = ioutil.WriteFile(fpath, []byte(p.Body), 0644) if err != nil { return err } diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index c970add..34af7ba 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -36,7 +36,7 @@ func TestTreatPost(t *testing.T) { // "https://www.zaobao.com/realtime/world/story20200825-1079575", // "https://www.zaobao.com/news/world/story20200825-1079477", // "https://www.zaobao.com.sg/realtime/world/story20200901-1081441", - "https://ec.ltn.com.tw/article/breakingnews/3277361", + "https://news.ltn.com.tw/news/world/breakingnews/3278726", } for _, tc := range tcs { p := PostFactory(tc) From fc15d0d238ba5009676c25db19edb852912a97d4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 2 Sep 2020 20:25:32 +0800 Subject: [PATCH 17/32] fix strong text filtered. --- internal/fetcher/sites/zaobao/zaobao.go | 10 +++++++++- internal/fetcher/sites/zaobao/zaobao_test.go | 10 ++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/internal/fetcher/sites/zaobao/zaobao.go b/internal/fetcher/sites/zaobao/zaobao.go index fee31f8..0de223d 100644 --- a/internal/fetcher/sites/zaobao/zaobao.go +++ b/internal/fetcher/sites/zaobao/zaobao.go @@ -105,12 +105,20 @@ func Zaobao(p *Post) (string, error) { return "", errors.New("[-] There is no tag named `<article>` from: " + p.URL.String()) } plist := htmldoc.ElementsByTag(nodes[0], "p") - for _, v := range plist { // the last item is `推荐阅读:` + for _, v := range plist { if v.FirstChild == nil { continue + } else if v.FirstChild.FirstChild != nil && + v.FirstChild.Data == "strong" { + a := htmldoc.ElementsByTag(v, "span") + for _, aa := range a { + body += aa.FirstChild.Data + } + body += " \n" } else { body += v.FirstChild.Data + " \n" } } + body = strings.ReplaceAll(body, "span \n", "") return body, nil } diff --git a/internal/fetcher/sites/zaobao/zaobao_test.go b/internal/fetcher/sites/zaobao/zaobao_test.go index 46bafba..4ae06ca 100644 --- a/internal/fetcher/sites/zaobao/zaobao_test.go +++ b/internal/fetcher/sites/zaobao/zaobao_test.go @@ -10,6 +10,8 @@ import ( "github.com/wedojava/fetcher/internal/htmldoc" ) +var p = PostFactory("https://www.zaobao.com/news/world/story20200830-1080786") + func PostFactory(rawurl string) *Post { url, err := url.Parse(rawurl) if err != nil { @@ -22,7 +24,6 @@ func PostFactory(rawurl string) *Post { } func TestSetDate(t *testing.T) { - p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) @@ -31,14 +32,13 @@ func TestSetDate(t *testing.T) { if err := SetDate(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "2020-08-25T09:42:32+08:00" + want := "2020-08-30T07:48:25+08:00" if p.Date != want { t.Errorf("\ngot: %v\nwant: %v", p.Date, want) } } func TestSetTitle(t *testing.T) { - p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) @@ -47,14 +47,13 @@ func TestSetTitle(t *testing.T) { if err := SetTitle(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "韩首都圈学校全面线上上课两周" + want := "国际特稿:美国副总统候选人 哈里斯魅力多元" if p.Title != want { t.Errorf("\ngot: %v\nwant: %v", p.Title, want) } } func TestZaobao(t *testing.T) { - p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) @@ -65,7 +64,6 @@ func TestZaobao(t *testing.T) { } func TestSetPost(t *testing.T) { - p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) From dd7b8c21d80f9b3ff804e5156ba531d8221b62f4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 29 Sep 2020 19:06:51 +0800 Subject: [PATCH 18/32] add new site entrance and modify ltn.com.tw entrance link. --- main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 547c2a4..766b123 100644 --- a/main.go +++ b/main.go @@ -11,7 +11,8 @@ import ( func main() { year := strconv.Itoa(time.Now().Year()) sites := []string{ - "https://news.ltn.com.tw/list/breakingnews", + "https://www.cna.com.tw/list/aall.aspx", + "https://news.ltn.com.tw/list/breakingnews/world", "https://www.zaobao.com/realtime/world", "https://www.zaobao.com/news/world", "https://www.zaobao.com/realtime/china", From f9c19d026ef3948c2b3e1aa4f21b194c6335fe2e Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 29 Sep 2020 19:42:48 +0800 Subject: [PATCH 19/32] add cna link fetcher and pass test. --- internal/fetcher/links.go | 6 ++++++ internal/fetcher/links_test.go | 23 +++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/internal/fetcher/links.go b/internal/fetcher/links.go index 7f392d4..fa0b8dc 100644 --- a/internal/fetcher/links.go +++ b/internal/fetcher/links.go @@ -42,6 +42,12 @@ func (f *Fetcher) SetLinks() error { f.Links = append(append(append(newsWorld, newsChina...), realtimeWorld...), realtimeChina...) case "news.ltn.com.tw": f.Links = LinksFilter(links, `https://news.*/news/.*`) + case "www.cna.com.tw": + newsFirst := LinksFilter(links, `.*?/news/firstnews/.*`) + newsWorld := LinksFilter(links, `.*?/news/aopl/.*`) + newsPolitical := LinksFilter(links, `.*?/news/aipl/.*`) + newsTW := LinksFilter(links, `.*?/news/acn/.*`) + f.Links = append(append(append(newsFirst, newsWorld...), newsPolitical...), newsTW...) } return nil } diff --git a/internal/fetcher/links_test.go b/internal/fetcher/links_test.go index 3d18619..822e6a8 100644 --- a/internal/fetcher/links_test.go +++ b/internal/fetcher/links_test.go @@ -17,7 +17,20 @@ func TestKickOutLinksMatchPath(t *testing.T) { } func TestSetLinks(t *testing.T) { - u, err := url.Parse("https://news.ltn.com.tw/list/breakingnews") + // u, err := url.Parse("https://news.ltn.com.tw/list/breakingnews") + // assertLinks := []string{ + // "https://news.ltn.com.tw/news/society/breakingnews/3278253", + // "https://news.ltn.com.tw/news/society/breakingnews/3278250", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278225", + // "https://news.ltn.com.tw/news/politics/breakingnews/3278170", + // } + u, err := url.Parse("https://www.cna.com.tw/list/aall.aspx") + assertLinks := []string{ + "https://www.cna.com.tw/news/aopl/202009290075.aspx", + "https://www.cna.com.tw/news/firstnews/202009290051.aspx", + "https://www.cna.com.tw/news/acn/202009290063.aspx", + "https://www.cna.com.tw/news/aipl/202009290055.aspx", + } if err != nil { t.Errorf("Url Parse fail!\n%s", err) } @@ -25,12 +38,6 @@ func TestSetLinks(t *testing.T) { Entrance: u, } f.SetLinks() - assertLinks := []string{ - "https://news.ltn.com.tw/news/society/breakingnews/3278253", - "https://news.ltn.com.tw/news/society/breakingnews/3278250", - "https://news.ltn.com.tw/news/politics/breakingnews/3278225", - "https://news.ltn.com.tw/news/politics/breakingnews/3278170", - } shot := 0 for _, link := range f.Links { for _, v := range assertLinks { @@ -39,7 +46,7 @@ func TestSetLinks(t *testing.T) { } } } - if shot == 0 { + if shot != len(assertLinks) { t.Errorf("want: %v, got: %v", len(assertLinks), shot) } } From bb45c07d4455f3197c87b2d7fde00dd09e9fc568 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 29 Sep 2020 20:09:47 +0800 Subject: [PATCH 20/32] add cna folder and new function to match date for cna --- internal/fetcher/post.go | 3 + internal/fetcher/sites/cna/cna.go | 127 +++++++++++++++++++++++++ internal/fetcher/sites/cna/cna_test.go | 77 +++++++++++++++ internal/htmldoc/htmldoc.go | 43 +++++++-- internal/htmldoc/htmldoc_test.go | 25 +++++ 5 files changed, 267 insertions(+), 8 deletions(-) create mode 100644 internal/fetcher/sites/cna/cna.go create mode 100644 internal/fetcher/sites/cna/cna_test.go diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index cf3ba5e..ec8b2b6 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -103,6 +103,9 @@ func (p *Post) TreatPost() error { return err } *p = Post(post) + case "www.cna.com.tw": + // TODO: need to be done. + default: return fmt.Errorf("switch no case on: %s", p.Domain) } diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go new file mode 100644 index 0000000..01f9532 --- /dev/null +++ b/internal/fetcher/sites/cna/cna.go @@ -0,0 +1,127 @@ +package cna + +import ( + "bytes" + "fmt" + "net/url" + "regexp" + "strings" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" + "github.com/wedojava/gears" + "golang.org/x/net/html" +) + +type Post struct { + Domain string + URL *url.URL + DOC *html.Node + Raw []byte + Title string + Body string + Date string + Filename string +} + +func SetPost(p *Post) error { + if err := setDate(p); err != nil { + return err + } + if err := setTitle(p); err != nil { + return err + } + if err := setBody(p); err != nil { + return err + } + return nil +} + +func setDate(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + metas := htmldoc.MetasByItemprop(p.DOC, "dateModified") + cs := []string{} + for _, meta := range metas { + for _, a := range meta.Attr { + if a.Key == "content" { + cs = append(cs, a.Val) + } + } + } + if len(cs) <= 0 { + return fmt.Errorf("SetData got nothing.") + } + // TODO: date format's needed. + p.Date = cs[0] + return nil +} + +func setTitle(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + n := htmldoc.ElementsByTag(p.DOC, "title") + if n == nil { + return fmt.Errorf("[-] there is no element <title>") + } + title := n[0].FirstChild.Data + title = strings.ReplaceAll(title, " - 自由時報電子報", "") + title = strings.TrimSpace(title) + gears.ReplaceIllegalChar(&title) + p.Title = title + return nil +} + +func setBody(p *Post) error { + if p.DOC == nil { + return fmt.Errorf("[-] p.DOC is nil") + } + b, err := ltn(p) + if err != nil { + return err + } + t, err := time.Parse(time.RFC3339, p.Date) + if err != nil { + return err + } + h1 := fmt.Sprintf("# [%02d.%02d][%02d%02dH] %s", t.Month(), t.Day(), t.Hour(), t.Minute(), p.Title) + p.Body = h1 + "\n\n" + b + "\n\n原地址:" + p.URL.String() + return nil +} + +func ltn(p *Post) (string, error) { + if p.Raw == nil { + return "", fmt.Errorf("[-] p.Raw is nil") + } + raw := p.Raw + // Fetch content nodes + r := htmldoc.DivWithAttr2(raw, "data-desc", "內容頁") + ps := [][]byte{} + b := bytes.Buffer{} + re := regexp.MustCompile(`<p>(.*?)</p>`) + for _, v := range re.FindAllSubmatch(r, -1) { + ps = append(ps, v[1]) + } + if len(ps) == 0 { + return "", fmt.Errorf("no <p> matched") + } + for _, p := range ps { + b.Write(p) + b.Write([]byte(" \n")) + } + body := b.String() + re = regexp.MustCompile(`「`) + body = re.ReplaceAllString(body, "“") + re = regexp.MustCompile(`」`) + body = re.ReplaceAllString(body, "”") + re = regexp.MustCompile(`<a.*?>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`</a>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<iframe.*?</iframe>`) + body = re.ReplaceAllString(body, "") + + return body, nil +} diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go new file mode 100644 index 0000000..5b635dc --- /dev/null +++ b/internal/fetcher/sites/cna/cna_test.go @@ -0,0 +1,77 @@ +package cna + +import ( + "fmt" + "log" + "net/url" + "testing" + "time" + + "github.com/wedojava/fetcher/internal/htmldoc" +) + +var p = PostFactory("https://www.cna.com.tw/news/aopl/202009290075.aspx") + +func PostFactory(rawurl string) *Post { + url, err := url.Parse(rawurl) + if err != nil { + log.Printf("url parse err: %s", err) + } + return &Post{ + Domain: url.Hostname(), + URL: url, + } +} + +func TestSetDate(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setDate(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "2020-09-01T12:14:01+08:00" + if p.Date != want { + t.Errorf("\ngot: %v\nwant: %v", p.Date, want) + } +} + +func TestSetTitle(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := setTitle(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + want := "反送中12港青逃台被逮 林鄭月娥暗示應「送中」 - 國際" + if p.Title != want { + t.Errorf("\ngot: %v\nwant: %v", p.Title, want) + } +} + +func TestLtn(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + tc, err := ltn(p) + fmt.Println(tc) +} + +func TestSetPost(t *testing.T) { + raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + p.Raw, p.DOC = raw, doc + if err := SetPost(p); err != nil { + t.Errorf("test SetPost err: %v", doc) + } + fmt.Println(p.Title) + fmt.Println(p.Body) +} diff --git a/internal/htmldoc/htmldoc.go b/internal/htmldoc/htmldoc.go index b959859..1262263 100644 --- a/internal/htmldoc/htmldoc.go +++ b/internal/htmldoc/htmldoc.go @@ -317,16 +317,17 @@ func ElementsNextByTag(doc *html.Node, tag string) []*html.Node { return nodes } -func MetasByName(doc *html.Node, name ...string) []*html.Node { +// MetasByName focus on `<meta name="dateModified" content="2020/09/29 11:27" />` +func MetasByName(doc *html.Node, values ...string) []*html.Node { var nodes []*html.Node - if doc == nil || name == nil { + if doc == nil || values == nil { return nil } if doc.Type == html.ElementNode { if doc.Data == "meta" { for _, a := range doc.Attr { if a.Key == "name" { - for _, v := range name { + for _, v := range values { if v == a.Val { nodes = append(nodes, doc) } @@ -336,21 +337,47 @@ func MetasByName(doc *html.Node, name ...string) []*html.Node { } } for c := doc.FirstChild; c != nil; c = c.NextSibling { - nodes = append(nodes, MetasByName(c, name...)...) + nodes = append(nodes, MetasByName(c, values...)...) } return nodes } -func MetasByProperty(doc *html.Node, name ...string) []*html.Node { +// MetasByItemprop focus on `<meta itemprop="dateModified" content="2020/09/29 11:27" />` +func MetasByItemprop(doc *html.Node, values ...string) []*html.Node { var nodes []*html.Node - if doc == nil || name == nil { + if doc == nil || values == nil { + return nil + } + if doc.Type == html.ElementNode { + if doc.Data == "meta" { + for _, a := range doc.Attr { + if a.Key == "itemprop" { + for _, v := range values { + if v == a.Val { + nodes = append(nodes, doc) + } + } + } + } + } + } + for c := doc.FirstChild; c != nil; c = c.NextSibling { + nodes = append(nodes, MetasByItemprop(c, values...)...) + } + return nodes +} + +// MetasByProperty focus on `<meta property="dateModified" content="2020/09/29 11:27" />` +func MetasByProperty(doc *html.Node, values ...string) []*html.Node { + var nodes []*html.Node + if doc == nil || values == nil { return nil } if doc.Type == html.ElementNode { if doc.Data == "meta" { for _, a := range doc.Attr { if a.Key == "property" { - for _, v := range name { + for _, v := range values { if v == a.Val { nodes = append(nodes, doc) } @@ -360,7 +387,7 @@ func MetasByProperty(doc *html.Node, name ...string) []*html.Node { } } for c := doc.FirstChild; c != nil; c = c.NextSibling { - nodes = append(nodes, MetasByProperty(c, name...)...) + nodes = append(nodes, MetasByProperty(c, values...)...) } return nodes } diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index dc7372a..6a324e4 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -117,6 +117,31 @@ func TestMetaByProperty(t *testing.T) { fmt.Println(rt[0]) } +func TestMetaByItemprop(t *testing.T) { + u, err = url.Parse("https://www.cna.com.tw/news/aopl/202009290075.aspx") + if err != nil { + t.Errorf("url Parse err: %v", err) + } + _, doc, err := GetRawAndDoc(u, 1*time.Minute) + if err != nil { + t.Errorf("GetRawAndDoc err: %v", err) + } + tc := MetasByItemprop(doc, "dateModified") + rt := []string{} + for _, n := range tc { + for _, a := range n.Attr { + if a.Key == "content" { + rt = append(rt, a.Val) + } + } + } + want := "2020/09/29 11:49" + if want != rt[0] { + t.Errorf("want: %v, got: %v", want, rt[0]) + } + fmt.Println(rt[0]) +} + func TestMetaByName(t *testing.T) { if err != nil { t.Errorf("url Parse err: %v", err) From 4178c4b0ab0c1c84299ee7a8db56b85694cfd02f Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Tue, 29 Sep 2020 21:30:33 +0800 Subject: [PATCH 21/32] date fmt pass test. --- internal/fetcher/sites/cna/cna.go | 22 ++++++++++++++++++++-- internal/fetcher/sites/cna/cna_test.go | 2 +- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 01f9532..d6c4ebb 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -5,6 +5,7 @@ import ( "fmt" "net/url" "regexp" + "strconv" "strings" "time" @@ -53,8 +54,25 @@ func setDate(p *Post) error { if len(cs) <= 0 { return fmt.Errorf("SetData got nothing.") } - // TODO: date format's needed. - p.Date = cs[0] + tY := cs[0][:4] + tM := cs[0][5:7] + tD := cs[0][7:9] + tH := cs[0][11:13] + tm := cs[0][14:16] + yy, err := strconv.Atoi(tY) + mm, err := strconv.Atoi(tM) + dd, err := strconv.Atoi(tD) + h, err := strconv.Atoi(tH) + m, err := strconv.Atoi(tm) + if err != nil { + return err + } + // China doesn't have daylight saving. It uses a fixed 8 hour offset from UTC. + secondsEastOfUTC := int((8 * time.Hour).Seconds()) + beijing := time.FixedZone("Beijing Time", secondsEastOfUTC) + t := time.Date(yy, time.Month(mm), dd, h, m, 0, 0, beijing) + p.Date = t.Format(time.RFC3339) + return nil } diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index 5b635dc..373ea1d 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -32,7 +32,7 @@ func TestSetDate(t *testing.T) { if err := setDate(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "2020-09-01T12:14:01+08:00" + want := "2020-08-31T11:49:00+08:00" if p.Date != want { t.Errorf("\ngot: %v\nwant: %v", p.Date, want) } From 1d12497f7ff7e9ecf308936c7b4fe02a5f916683 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 00:57:40 +0800 Subject: [PATCH 22/32] cna SetBody done. --- internal/fetcher/sites/cna/cna.go | 51 ++++++++++++-------------- internal/fetcher/sites/cna/cna_test.go | 6 +-- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index d6c4ebb..45ece75 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -1,7 +1,7 @@ package cna import ( - "bytes" + "errors" "fmt" "net/url" "regexp" @@ -85,7 +85,7 @@ func setTitle(p *Post) error { return fmt.Errorf("[-] there is no element <title>") } title := n[0].FirstChild.Data - title = strings.ReplaceAll(title, " - 自由時報電子報", "") + title = strings.ReplaceAll(title, " | 中央社 CNA", "") title = strings.TrimSpace(title) gears.ReplaceIllegalChar(&title) p.Title = title @@ -96,7 +96,7 @@ func setBody(p *Post) error { if p.DOC == nil { return fmt.Errorf("[-] p.DOC is nil") } - b, err := ltn(p) + b, err := cna(p) if err != nil { return err } @@ -109,34 +109,29 @@ func setBody(p *Post) error { return nil } -func ltn(p *Post) (string, error) { - if p.Raw == nil { - return "", fmt.Errorf("[-] p.Raw is nil") +func cna(p *Post) (string, error) { + if p.DOC == nil { + return "", fmt.Errorf("[-] p.DOC is nil") } - raw := p.Raw + doc := p.DOC + body := "" // Fetch content nodes - r := htmldoc.DivWithAttr2(raw, "data-desc", "內容頁") - ps := [][]byte{} - b := bytes.Buffer{} - re := regexp.MustCompile(`<p>(.*?)</p>`) - for _, v := range re.FindAllSubmatch(r, -1) { - ps = append(ps, v[1]) - } - if len(ps) == 0 { - return "", fmt.Errorf("no <p> matched") - } - for _, p := range ps { - b.Write(p) - b.Write([]byte(" \n")) + nodes := htmldoc.ElementsByTagAndClass(doc, "div", "paragraph") + if len(nodes) == 0 { + return "", errors.New("[-] There is no element class is paragraph` from: " + p.URL.String()) + } + n := nodes[0] + plist := htmldoc.ElementsByTag(n, "p") + for _, v := range plist { + if v.FirstChild != nil { + body += v.FirstChild.Data + " \n" + } } - body := b.String() - re = regexp.MustCompile(`「`) - body = re.ReplaceAllString(body, "“") - re = regexp.MustCompile(`」`) - body = re.ReplaceAllString(body, "”") - re = regexp.MustCompile(`<a.*?>`) - body = re.ReplaceAllString(body, "") - re = regexp.MustCompile(`</a>`) + + body = strings.ReplaceAll(body, "「", "“") + body = strings.ReplaceAll(body, "」", "”") + + re := regexp.MustCompile(`<a.*?</a>`) body = re.ReplaceAllString(body, "") re = regexp.MustCompile(`<iframe.*?</iframe>`) body = re.ReplaceAllString(body, "") diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index 373ea1d..6720c58 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -47,19 +47,19 @@ func TestSetTitle(t *testing.T) { if err := setTitle(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "反送中12港青逃台被逮 林鄭月娥暗示應「送中」 - 國際" + want := "擋下TikTok封殺令 美法官:川普可能逾越法律 | 國際" if p.Title != want { t.Errorf("\ngot: %v\nwant: %v", p.Title, want) } } -func TestLtn(t *testing.T) { +func TestCna(t *testing.T) { raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) } p.Raw, p.DOC = raw, doc - tc, err := ltn(p) + tc, err := cna(p) fmt.Println(tc) } From 6741378d8f17c9901a5974cb848767752ba965d4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 01:03:20 +0800 Subject: [PATCH 23/32] set and save cna post pass test. --- internal/fetcher/post.go | 8 ++++++-- internal/fetcher/post_test.go | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/internal/fetcher/post.go b/internal/fetcher/post.go index ec8b2b6..03f5ce9 100644 --- a/internal/fetcher/post.go +++ b/internal/fetcher/post.go @@ -12,6 +12,7 @@ import ( "time" "github.com/wedojava/fetcher/internal/fetcher/sites/boxun" + "github.com/wedojava/fetcher/internal/fetcher/sites/cna" "github.com/wedojava/fetcher/internal/fetcher/sites/dwnews" "github.com/wedojava/fetcher/internal/fetcher/sites/ltn" "github.com/wedojava/fetcher/internal/fetcher/sites/rfa" @@ -104,8 +105,11 @@ func (p *Post) TreatPost() error { } *p = Post(post) case "www.cna.com.tw": - // TODO: need to be done. - + post := cna.Post(*p) + if err := cna.SetPost(&post); err != nil { + return err + } + *p = Post(post) default: return fmt.Errorf("switch no case on: %s", p.Domain) } diff --git a/internal/fetcher/post_test.go b/internal/fetcher/post_test.go index 34af7ba..a861833 100644 --- a/internal/fetcher/post_test.go +++ b/internal/fetcher/post_test.go @@ -12,7 +12,8 @@ import ( func TestSetAndSavePost(t *testing.T) { // p := PostFactory("https://www.dwnews.com/经济/60203253") // p := PostFactory("https://www.rfa.org/mandarin/Xinwen/6-07082020110802.html") // The wrong one - p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + // p := PostFactory("https://www.zaobao.com/realtime/world/story20200825-1079575") + p := PostFactory("https://www.cna.com.tw/news/aopl/202009290075.aspx") raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoC error: %v", err) @@ -36,7 +37,8 @@ func TestTreatPost(t *testing.T) { // "https://www.zaobao.com/realtime/world/story20200825-1079575", // "https://www.zaobao.com/news/world/story20200825-1079477", // "https://www.zaobao.com.sg/realtime/world/story20200901-1081441", - "https://news.ltn.com.tw/news/world/breakingnews/3278726", + // "https://news.ltn.com.tw/news/world/breakingnews/3278726", + "https://www.cna.com.tw/news/aopl/202009290075.aspx", } for _, tc := range tcs { p := PostFactory(tc) From 9042e38357d467fc721806794c1ee541f8bbe7a4 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 01:21:40 +0800 Subject: [PATCH 24/32] fix date get error. --- README.md | 2 +- internal/fetcher/fetcher_test.go | 3 ++- internal/fetcher/sites/cna/cna.go | 2 +- internal/fetcher/sites/cna/cna_test.go | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 46c5934..20f0641 100644 --- a/README.md +++ b/README.md @@ -11,5 +11,5 @@ Fetch depth: 1 There files need to be modified while add a new site: - main.go: add entrance url - links.go -> SetLinks(): add case about target urls feature regex, eg: if url must have `about`, param 2 is `.*?about.*` -- post.go -> TreatPost(): add case for new site domain. - site/newsite: copy files from sibling folder, then develop and pass the test. +- post.go -> TreatPost(): add case for new site domain. diff --git a/internal/fetcher/fetcher_test.go b/internal/fetcher/fetcher_test.go index 1f18ce5..1c3b47b 100644 --- a/internal/fetcher/fetcher_test.go +++ b/internal/fetcher/fetcher_test.go @@ -15,7 +15,8 @@ func TestCrawl(t *testing.T) { // "https://www.zaobao.com/news/world", // "https://www.voachinese.com", // "https://www.rfa.org/mandarin/", - "https://news.ltn.com.tw/list/breakingnews", + // "https://news.ltn.com.tw/list/breakingnews", + "https://www.cna.com.tw/list/aall.aspx", }) log.Println("Sleep a sec ...") diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 45ece75..8e7d5a9 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -56,7 +56,7 @@ func setDate(p *Post) error { } tY := cs[0][:4] tM := cs[0][5:7] - tD := cs[0][7:9] + tD := cs[0][8:10] tH := cs[0][11:13] tm := cs[0][14:16] yy, err := strconv.Atoi(tY) diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index 6720c58..85b6128 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -32,7 +32,7 @@ func TestSetDate(t *testing.T) { if err := setDate(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "2020-08-31T11:49:00+08:00" + want := "2020-09-29T11:49:00+08:00" if p.Date != want { t.Errorf("\ngot: %v\nwant: %v", p.Date, want) } From a07d6aa81c2829e396c8d4bf5b443a8ca74da759 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 02:07:12 +0800 Subject: [PATCH 25/32] add ignore via title words. --- internal/fetcher/sites/cna/cna.go | 11 +++++++++++ internal/fetcher/sites/cna/cna_test.go | 3 ++- internal/fetcher/sites/ltn/ltn.go | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 8e7d5a9..cffda03 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -85,6 +85,17 @@ func setTitle(p *Post) error { return fmt.Errorf("[-] there is no element <title>") } title := n[0].FirstChild.Data + if strings.Contains(title, "| 娛樂 |") || + strings.Contains(title, "| 運動 |") || + strings.Contains(title, "| 文化 |") || + strings.Contains(title, "| 地方 |") || + strings.Contains(title, "| 社會 |") || + strings.Contains(title, "| 生活 |") || + strings.Contains(title, "| 科技 |") || + strings.Contains(title, "| 證券 |") || + strings.Contains(title, "| 產經 |") { + return errors.New("ignore posts: " + p.URL.String()) + } title = strings.ReplaceAll(title, " | 中央社 CNA", "") title = strings.TrimSpace(title) gears.ReplaceIllegalChar(&title) diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index 85b6128..394bfc1 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -64,13 +64,14 @@ func TestCna(t *testing.T) { } func TestSetPost(t *testing.T) { + var p = PostFactory("https://www.cna.com.tw/news/afe/202009290241.aspx") // should be ignore raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) } p.Raw, p.DOC = raw, doc if err := SetPost(p); err != nil { - t.Errorf("test SetPost err: %v", doc) + t.Errorf("test SetPost err: %v", err) } fmt.Println(p.Title) fmt.Println(p.Body) diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go index 7453d66..c7398bb 100644 --- a/internal/fetcher/sites/ltn/ltn.go +++ b/internal/fetcher/sites/ltn/ltn.go @@ -2,6 +2,7 @@ package ltn import ( "bytes" + "errors" "fmt" "net/url" "regexp" @@ -66,6 +67,20 @@ func setTitle(p *Post) error { return fmt.Errorf("[-] there is no element <title>") } title := n[0].FirstChild.Data + if strings.Contains(title, "- 娛樂 -") || + strings.Contains(title, "- 食譜 -") || + strings.Contains(title, "- 地產 -") || + strings.Contains(title, "- 體育 -") || + strings.Contains(title, "- 地方 -") || + strings.Contains(title, "- 蒐奇 -") || + strings.Contains(title, "- 社會 -") || + strings.Contains(title, "- 生活 -") || + strings.Contains(title, "- 时尚 -") || + strings.Contains(title, "- 健康 -") || + strings.Contains(title, "- 汽車 -") || + strings.Contains(title, "- 財經 -") { + return errors.New("ignore posts: " + p.URL.String()) + } title = strings.ReplaceAll(title, " - 自由時報電子報", "") title = strings.TrimSpace(title) gears.ReplaceIllegalChar(&title) From 6c28284d39462650aba5a2eb60ea95f9ab042060 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 02:14:14 +0800 Subject: [PATCH 26/32] err msg optimized. --- internal/fetcher/sites/cna/cna.go | 2 +- internal/fetcher/sites/ltn/ltn.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index cffda03..a5f519b 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -94,7 +94,7 @@ func setTitle(p *Post) error { strings.Contains(title, "| 科技 |") || strings.Contains(title, "| 證券 |") || strings.Contains(title, "| 產經 |") { - return errors.New("ignore posts: " + p.URL.String()) + return errors.New("ignore post on purpose: " + p.URL.String()) } title = strings.ReplaceAll(title, " | 中央社 CNA", "") title = strings.TrimSpace(title) diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go index c7398bb..d828e23 100644 --- a/internal/fetcher/sites/ltn/ltn.go +++ b/internal/fetcher/sites/ltn/ltn.go @@ -79,7 +79,7 @@ func setTitle(p *Post) error { strings.Contains(title, "- 健康 -") || strings.Contains(title, "- 汽車 -") || strings.Contains(title, "- 財經 -") { - return errors.New("ignore posts: " + p.URL.String()) + return errors.New("ignore post on purpose: " + p.URL.String()) } title = strings.ReplaceAll(title, " - 自由時報電子報", "") title = strings.TrimSpace(title) From 7f1db2ec6ba5fb67a3375e49edb32fe9dc213bf7 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 02:20:16 +0800 Subject: [PATCH 27/32] cna fetch range committed. --- internal/fetcher/sites/cna/cna.go | 2 ++ main.go | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index a5f519b..6c865c5 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -86,6 +86,8 @@ func setTitle(p *Post) error { } title := n[0].FirstChild.Data if strings.Contains(title, "| 娛樂 |") || + strings.Contains(title, "| 政治 |") || + strings.Contains(title, "| 兩岸 |") || strings.Contains(title, "| 運動 |") || strings.Contains(title, "| 文化 |") || strings.Contains(title, "| 地方 |") || diff --git a/main.go b/main.go index 766b123..aa81256 100644 --- a/main.go +++ b/main.go @@ -11,8 +11,9 @@ import ( func main() { year := strconv.Itoa(time.Now().Year()) sites := []string{ - "https://www.cna.com.tw/list/aall.aspx", - "https://news.ltn.com.tw/list/breakingnews/world", + // expand fetch range need update cna.go function: setTitle + "https://www.cna.com.tw/list/aopl.aspx", // 国际 + "https://news.ltn.com.tw/list/breakingnews/world", // 国际 "https://www.zaobao.com/realtime/world", "https://www.zaobao.com/news/world", "https://www.zaobao.com/realtime/china", From 4a2997ed9db16ad1ee144806442256cadcae8245 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 02:32:05 +0800 Subject: [PATCH 28/32] fix post ignore keywords error --- internal/fetcher/sites/ltn/ltn.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go index d828e23..ede94b4 100644 --- a/internal/fetcher/sites/ltn/ltn.go +++ b/internal/fetcher/sites/ltn/ltn.go @@ -67,18 +67,18 @@ func setTitle(p *Post) error { return fmt.Errorf("[-] there is no element <title>") } title := n[0].FirstChild.Data - if strings.Contains(title, "- 娛樂 -") || - strings.Contains(title, "- 食譜 -") || - strings.Contains(title, "- 地產 -") || - strings.Contains(title, "- 體育 -") || - strings.Contains(title, "- 地方 -") || - strings.Contains(title, "- 蒐奇 -") || - strings.Contains(title, "- 社會 -") || - strings.Contains(title, "- 生活 -") || - strings.Contains(title, "- 时尚 -") || - strings.Contains(title, "- 健康 -") || - strings.Contains(title, "- 汽車 -") || - strings.Contains(title, "- 財經 -") { + if strings.Contains(title, "- 娛樂") || + strings.Contains(title, "- 食譜") || + strings.Contains(title, "- 地產") || + strings.Contains(title, "- 體育") || + strings.Contains(title, "- 地方") || + strings.Contains(title, "- 蒐奇") || + strings.Contains(title, "- 社會") || + strings.Contains(title, "- 生活") || + strings.Contains(title, "- 时尚") || + strings.Contains(title, "- 健康") || + strings.Contains(title, "- 汽車") || + strings.Contains(title, "- 財經") { return errors.New("ignore post on purpose: " + p.URL.String()) } title = strings.ReplaceAll(title, " - 自由時報電子報", "") From 36fb1b31b46fad4d56f21f42a8bc498d16965dcf Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 03:20:43 +0800 Subject: [PATCH 29/32] need to fetch H2 content. --- internal/fetcher/sites/cna/cna_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index 394bfc1..ca04313 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -10,7 +10,7 @@ import ( "github.com/wedojava/fetcher/internal/htmldoc" ) -var p = PostFactory("https://www.cna.com.tw/news/aopl/202009290075.aspx") +var p = PostFactory("https://www.cna.com.tw/news/firstnews/202009295001.aspx") func PostFactory(rawurl string) *Post { url, err := url.Parse(rawurl) @@ -45,9 +45,9 @@ func TestSetTitle(t *testing.T) { } p.Raw, p.DOC = raw, doc if err := setTitle(p); err != nil { - t.Errorf("test SetPost err: %v", doc) + t.Errorf("test SetPost err: %v", err) } - want := "擋下TikTok封殺令 美法官:川普可能逾越法律 | 國際" + want := "早安世界》安心旅遊補助續辦至10月底 中秋雙十連假可用 | 生活 | 重點新聞" if p.Title != want { t.Errorf("\ngot: %v\nwant: %v", p.Title, want) } From 4852a29cc8d6319f9db10890cb049a5e4bd2cbed Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 12:30:58 +0800 Subject: [PATCH 30/32] fix link and h2 text lost --- internal/fetcher/sites/cna/cna.go | 5 +++-- internal/fetcher/sites/cna/cna_test.go | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 6c865c5..9078ed0 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -134,7 +134,7 @@ func cna(p *Post) (string, error) { return "", errors.New("[-] There is no element class is paragraph` from: " + p.URL.String()) } n := nodes[0] - plist := htmldoc.ElementsByTag(n, "p") + plist := htmldoc.ElementsByTag(n, "h2", "p") for _, v := range plist { if v.FirstChild != nil { body += v.FirstChild.Data + " \n" @@ -143,8 +143,9 @@ func cna(p *Post) (string, error) { body = strings.ReplaceAll(body, "「", "“") body = strings.ReplaceAll(body, "」", "”") + body = strings.ReplaceAll(body, "</a>", "") - re := regexp.MustCompile(`<a.*?</a>`) + re := regexp.MustCompile(`<a.*?>`) body = re.ReplaceAllString(body, "") re = regexp.MustCompile(`<iframe.*?</iframe>`) body = re.ReplaceAllString(body, "") diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index ca04313..5fb3761 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -10,7 +10,7 @@ import ( "github.com/wedojava/fetcher/internal/htmldoc" ) -var p = PostFactory("https://www.cna.com.tw/news/firstnews/202009295001.aspx") +var p = PostFactory("https://www.cna.com.tw/news/aopl/202009300058.aspx") func PostFactory(rawurl string) *Post { url, err := url.Parse(rawurl) @@ -32,7 +32,7 @@ func TestSetDate(t *testing.T) { if err := setDate(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "2020-09-29T11:49:00+08:00" + want := "2020-09-30T10:54:00+08:00" if p.Date != want { t.Errorf("\ngot: %v\nwant: %v", p.Date, want) } @@ -47,7 +47,7 @@ func TestSetTitle(t *testing.T) { if err := setTitle(p); err != nil { t.Errorf("test SetPost err: %v", err) } - want := "早安世界》安心旅遊補助續辦至10月底 中秋雙十連假可用 | 生活 | 重點新聞" + want := "被爆10年沒繳稅 川普:避稅計畫展現我的才智 | 國際" if p.Title != want { t.Errorf("\ngot: %v\nwant: %v", p.Title, want) } From 874f2765f5a59b2bf48510fdcefc1349737864a7 Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Wed, 30 Sep 2020 12:31:24 +0800 Subject: [PATCH 31/32] update test case --- internal/htmldoc/htmldoc_test.go | 15 +++++--- internal/htmldoc/test.html | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/internal/htmldoc/htmldoc_test.go b/internal/htmldoc/htmldoc_test.go index 6a324e4..3366618 100644 --- a/internal/htmldoc/htmldoc_test.go +++ b/internal/htmldoc/htmldoc_test.go @@ -41,17 +41,20 @@ func TestDivWithAttr2(t *testing.T) { } func TestElementsByTagAndClass(t *testing.T) { + s, err := ioutil.ReadFile("./test.html") if err != nil { - t.Errorf("url Parse err: %v", err) + t.Errorf("read file err: %v", err) } - _, doc, err := GetRawAndDoc(u, 1*time.Minute) + doc, err := html.Parse(bytes.NewReader(s)) if err != nil { t.Errorf("GetRawAndDoc err: %v", err) } - tc := ElementsByTagAndClass(doc, "div", "article-content-container") - plist := ElementsByTag(tc[0], "p") - for _, v := range plist { - fmt.Println(v.FirstChild.Data) + tc := ElementsByTagAndClass(doc, "div", "paragraph") + a := ElementsByTag(tc[0], "h2", "p") + for _, v := range a { + if v.FirstChild != nil { + fmt.Println(v.FirstChild.Data) + } } } func TestElementsByTagAndClass2(t *testing.T) { diff --git a/internal/htmldoc/test.html b/internal/htmldoc/test.html index 55804a0..f2e347c 100644 --- a/internal/htmldoc/test.html +++ b/internal/htmldoc/test.html @@ -15,5 +15,69 @@ <br> test text </p> + <div class="paragraph"> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="book-title"> + <div> + 今晨最新 + </div> + </div> + <div class="book" style="line-height: 1.8"> + <div class="book-content"> + <ul class="book-list"> + <li><a class="moreArticle-link" href="https://www.cna.com.tw/news/firstnews/202009290004.aspx">市長聯盟致歉:台灣六都會籍因技術問題誤分類</a></li> + <li><a class="moreArticle-link" href="https://www.cna.com.tw/news/firstnews/202009290002.aspx">解放軍4海域同時軍演 升高對台美施壓</a></li> + <li><a class="moreArticle-link" href="https://www.cna.com.tw/news/firstnews/202009290009.aspx">美股續漲 道瓊勁揚逾410點</a></li> + </ul> + </div> + </div></span> + </div> + </div> + <h2>安心旅遊補助續辦至10月底 中秋雙十連假可用</h2> + <p>交通部長林佳龍28日向行政院長蘇貞昌報告安心旅遊成效,蘇貞昌支持交通部觀光局安心旅遊補助續辦到10月底,預計中秋及雙十連假出遊民眾,仍可享用這項補助。 (<a href="https://www.cna.com.tw/news/firstnews/202009285009.aspx" class="早安世界延伸1">看完整報導</a>)</p> + <h2>8月景氣燈號轉綠燈 國發會:經濟漸趨回穩</h2> + <p>國發會報喜,28日宣布8月景氣燈號脫離黃藍燈、轉為代表穩定的綠燈,綜合判斷分數也較上月大增5分、升至26分,國發會認為,景氣從低點反轉向上的跡象愈來愈明顯,台灣經濟漸趨回穩。 (<a href="https://www.cna.com.tw/news/firstnews/202009285007.aspx" class="早安世界延伸2">看完整報導</a>)</p> + <h2>武漢肺炎死亡人數破百萬 中國豪賭疫苗效力</h2> + <p>武漢肺炎從中國爆發並蔓延全球迄今不到一年,迄今已逾百萬人喪命。世衛警告,若不採取更多集體行動,病故人數恐倍增。另一方面,中國正大範圍為特定人員接種<a href="https://www.cna.com.tw/news/firstnews/202009280182.aspx" class="早安世界延伸3">2019冠狀病毒疾病疫苗</a>,紐時報導,沒有其他國家在常規藥物試驗程序之外,以如此之大的規模給人們注射未經檢驗的疫苗。中國這種急切的做法相當於一場豪賭。(<a href="https://www.cna.com.tw/news/firstnews/202009280009.aspx" class="早安世界延伸4">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>亞美尼亞、亞塞拜然交戰 30年夙怨重新點燃</h2> + <p>高加索地區的亞塞拜然與亞美尼亞28日進入交戰第2日,已導致數十人喪生,雙方為這場戰鬥指責彼此,世界領袖已促請雙方冷靜,各界擔心這場戰鬥引發全面衝突,可能捲入區域大國俄羅斯和土耳其。「雙亞」夙怨糾結30年,因主張脫離亞塞拜然的納哥諾卡拉巴克地區迭起紛爭。 (<a href="https://www.cna.com.tw/news/firstnews/202009280159.aspx" class="早安世界延伸5">看完整報導</a>)</p> + <h2>TikTok暫逃封殺命運 聯邦法院擋下川普政府禁令</h2> + <p>川普政府原訂28日起禁止美國用戶下載熱門短影音分享應用程式TikTok,聯邦法官27日在最後一刻做出裁決,暫緩實施這項具有政治色彩的禁令,讓TikTok暫時逃過一劫。 (<a href="https://www.cna.com.tw/news/firstnews/202009280016.aspx" class="早安世界延伸6">看完整報導</a>)</p> + <h2>陽光普照代表台灣 角逐奧斯卡最佳國際影片</h2> + <p>導演鍾孟宏執導的電影「陽光普照」,將代表台灣角逐美國第93屆奧斯卡最佳國際影片獎。文化部28日表示,2020年共有18部國片報名,經過甄選後推薦由「陽光普照」代表台灣參賽,甄選委員認為本片「親子議題刻劃深刻,觸動人心,製作品質領先群倫,備受影展肯定,國際能見度佳。」(<a style="background-color: #ffffff;" href="https://www.cna.com.tw/news/firstnews/202009280216.aspx" class="早安世界延伸7">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>菲律賓籍移工檢疫期滿確診武漢肺炎 曾聚餐趴趴走一天半</h2> + <p>一名菲律賓移工檢疫期滿採檢,確診武漢肺炎。但在採檢結果出爐前,個案已進到社區活動約1.5天時間。疫情指揮中心28日表示,現已改為檢疫期滿前採檢,未來不會發生類似狀況。 (<a href="https://www.cna.com.tw/news/ahel/202009280170.aspx" class="早安世界延伸8">看完整報導</a>)</p> + <h2>台藝人中國國慶唱我的祖國若確認違法 最重罰50萬</h2> + <p>藝人歐陽娜娜、張韶涵傳將在中國國慶晚會獻唱「我的祖國」等歌曲,文化部28日表示,如果大陸委員會確認違法,最重可處新台幣50萬元罰鍰。此外,阿美族人楊品驊在海峽論壇上<a href="https://www.cna.com.tw/news/aipl/202009280241.aspx" class="早安世界延伸9">自稱是中國人</a>,原民會主委夷將.拔路兒28日表示,不反對與中國大陸互動,但必須認清自己是中華民國國民,不是中國人;不能忍受用個人的名義代表整個族群。 (<a href="https://www.cna.com.tw/news/firstnews/202009280099.aspx" class="早安世界延伸10">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"> + <div class="dashedline"></div></span> + </div> + </div> + <h2>川普超會抵稅 當選那年所得稅僅繳750美元</h2> + <p>美國總統大選首場電視辯論即將登場之際,川普的財務紀錄又掀爭議。「紐約時報」27日爆料,川普在贏得大選的2016年,只繳了750美元(約新台幣2萬2000元)聯邦所得稅。川普對紐時的報導不屑一顧,他說自己「付了很多錢,也繳了很多的州所得稅」。 (<a href="https://www.cna.com.tw/news/firstnews/202009280129.aspx" class="早安世界延伸11">看完整報導</a>)</p> + <h2>中國審查制度下 編輯嘆出版太難</h2> + <p>法國經濟學家皮凱提新書無法在中國出版,只是反映出中國審查制度的冰山一角。本地編輯表示,現在有多重出版禁忌,且有時不知道紅線劃在哪裡,出好書越來越難,因為審核制度越來越嚴格,大家選書趨於保守。(<a href="https://www.cna.com.tw/news/acn/202009280323.aspx" class="早安世界延伸12">看完整報導</a>)</p> + <div class="media"> + <div class="myCustom"> + <span style="font-weight: 400;"><p style="background-color: #F6F6F6; padding:30px; text-align:left">上午8點同步發送電子報!快來<a href="https://cna.us19.list-manage.com/subscribe?u=1c186687e418733737656ad4c&id=3a8cd69d5d">訂閱「早安世界」</a>給你最精華的新聞摘要。<br />若有任何建議請來信告訴我們,想獲得更多最新資訊快來<a href="https://www.facebook.com/cnanewstaiwan/?epa=SEARCH_BOX">和中央社做朋友</a>。<br />歡迎訂閱<a href="https://cna.us19.list-manage.com/subscribe?u=1c186687e418733737656ad4c&id=ecac484a79">中央社國際新聞電子報</a>,每週三、日發報,掌握世界脈動。</p></span> + </div> + </div> + </div> </body> </html> From 0489467a7ad972124695bf87074a3920fb45cc2f Mon Sep 17 00:00:00 2001 From: wedojava <wedojava@gmail.com> Date: Fri, 9 Oct 2020 19:59:02 +0800 Subject: [PATCH 32/32] cut script and blockquote elements --- internal/fetcher/sites/cna/cna.go | 2 ++ internal/fetcher/sites/ltn/ltn.go | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 9078ed0..d0c2a1b 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -147,6 +147,8 @@ func cna(p *Post) (string, error) { re := regexp.MustCompile(`<a.*?>`) body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<script.*?</script>`) + body = re.ReplaceAllString(body, "") re = regexp.MustCompile(`<iframe.*?</iframe>`) body = re.ReplaceAllString(body, "") diff --git a/internal/fetcher/sites/ltn/ltn.go b/internal/fetcher/sites/ltn/ltn.go index ede94b4..d9edce9 100644 --- a/internal/fetcher/sites/ltn/ltn.go +++ b/internal/fetcher/sites/ltn/ltn.go @@ -134,6 +134,10 @@ func ltn(p *Post) (string, error) { body = re.ReplaceAllString(body, "") re = regexp.MustCompile(`</a>`) body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<script.*?</script>`) + body = re.ReplaceAllString(body, "") + re = regexp.MustCompile(`<blockquote.*?</blockquote>`) + body = re.ReplaceAllString(body, "") re = regexp.MustCompile(`<iframe.*?</iframe>`) body = re.ReplaceAllString(body, "")