時間:2022-08-06 22:12:01 | 來源:網站運營
時間:2022-08-06 22:12:01 來源:網站運營
最后執行下面命令:
```shell
go mod tidy
go mod vendor
go build
```
編譯結束后,配置config。重命名config.dist.json為config.json,打開config.json,修改mysql部分的配置,填寫為你的mysql地址、用戶名、密碼、數據庫信息,新建cobweb數據庫,導入mysql.sql到填寫的數據庫中,然后雙擊運行可執行文件即可開始采集之旅。
```json
{
  "mysql": { //數據庫配置
    "Database": "spider",
    "User": "root",
    "Password": "root",
    "Charset": "utf8mb4",
    "Host": "127.0.0.1",
    "TablePrefix": "",
    "Port": 3306,
    "MaxIdleConnections": 1000,
    "MaxOpenConnections": 100000
  }
}
```
var MaxChan = 100var waitGroup sync.WaitGroupvar ch = make(chan string, MaxChan)func SingleSpider(){ var websites []Website var counter int DB.Model(&Website{}).Where("`status` = 0").Limit(MaxChan*10).Count(&counter).Find(&websites) if counter > 0 { for _, v := range websites { ch <- v.Domain waitGroup.Add(1) go SingleData2(v) } } else { log.Println("等待數據中,10秒后重試") time.Sleep(10 * time.Second) } SingleSpider()}
//鎖定當前數據 DB.Model(&website).Where("`id` = ?", website.ID).Update("status", 2) log.Println(fmt.Sprintf("開始采集:%s://%s", website.Scheme, website.Domain)) err := website.GetWebsite() if err == nil { website.Status = 1 } else { website.Status = 3 } log.Println(fmt.Sprintf("入庫2:%d:%s",website.ID, website.Domain)) DB.Save(&website)
// Insert a website row (domain, scheme, title) only when no existing row has
// the same domain; the values are passed as bound parameters.
DB.Exec("insert into website(`domain`, `scheme`,`title`) select ?,?,? from dual where not exists(select id from website where `domain` = ?)", v.Domain, v.Scheme, v.Title, v.Domain)
contentType := strings.ToLower(resp.Header.Get("Content-Type")) log.Println(contentType) var htmlEncode string if contentType == "" { //先嘗試讀取charset reg := regexp.MustCompile(`(?is)charset=["']?/s*([a-z0-9/-]+)`) match := reg.FindStringSubmatch(body) if len(match) > 1 { htmlEncode = strings.ToLower(match[1]) if htmlEncode != "utf-8" && htmlEncode != "utf8" { body = ConvertToString(body, "gbk", "utf-8") } } else { reg = regexp.MustCompile(`(?is)<title[^>]*>(.*?)<//title>`) match = reg.FindStringSubmatch(body) if len(match) > 1 { aa := match[1] _, htmlEncode, _ = charset.DetermineEncoding([]byte(aa), "") if htmlEncode != "utf-8" { body = ConvertToString(body, "gbk", "utf-8") } } } } else if !strings.Contains(contentType, "utf-8") { body = ConvertToString(body, "gbk", "utf-8") }
//嘗試獲取微信 reg := regexp.MustCompile(`(?i)(微信|微信客服|微信號|微信咨詢|微信服務)/s*(:|:|/s)/s*([a-z0-9/-_]{4,30})`) match := reg.FindStringSubmatch(contentText) if len(match) > 1 { website.WeChat = match[3] } //嘗試獲取QQ reg = regexp.MustCompile(`(?i)(QQ|QQ客服|QQ號|QQ號碼|QQ咨詢|QQ聯(lián)系|QQ交談)/s*(:|:|/s)/s*([0-9]{5,12})`) match = reg.FindStringSubmatch(contentText) if len(match) > 1 { website.QQ = match[3] } //嘗試獲取電話 reg = regexp.MustCompile(`([0148][1-9][0-9][0-9/-]{4,15})`) match = reg.FindStringSubmatch(contentText) if len(match) > 1 { website.Cellphone = match[1] }
關鍵詞:信息,訪問,獲取,把手