<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
爬虫处理网页时,页面的 meta 标签声明的是 gb2312 字符集,goquery 解析出来的内容是乱码。
刚开始用的
// DecodeToGBK2312 converts text whose bytes are GB2312/GBK-encoded into a
// UTF-8 string.
//
// It decodes with simplifiedchinese.GBK rather than simplifiedchinese.HZGB2312:
// HZ-GB-2312 is the 7-bit, escape-sequence based "HZ" encoding, which is a
// different wire format from the 8-bit bytes a page declared as
// charset=gb2312 actually contains. GBK is a superset of GB2312, so it
// decodes such pages correctly.
//
// Decoder.String sizes its output buffer itself, so the result is never
// truncated by a too-small destination. If decoding reports an error, text is
// returned unchanged.
func DecodeToGBK2312(text string) string {
	decoded, err := simplifiedchinese.GBK.NewDecoder().String(text)
	if err != nil {
		return text
	}
	return decoded
}
结果还是乱码。
后来在wiki里找了一种方法
// detectContentCharset sniffs up to the first 1024 bytes of body and returns
// the name of the encoding it finds, or "utf-8" when nothing was detected.
//
// The bytes are read through a bufio.Reader created here. Unless body is
// already a *bufio.Reader with a large enough buffer (in which case
// bufio.NewReader returns it as-is and Peek does not advance it), the sniffed
// bytes are consumed from body; callers that keep reading body afterwards
// should pass a *bufio.Reader.
func detectContentCharset(body io.Reader) string {
	const fallback = "utf-8"

	r := bufio.NewReader(body)
	// Peek returns the bytes it managed to read together with an error
	// (io.EOF) when body is shorter than 1024 bytes, so the error alone must
	// not discard the data.
	data, _ := r.Peek(1024)
	if len(data) == 0 {
		return fallback
	}

	// The third result is "certain", not "found": it is false for encodings
	// taken from a <meta> declaration, which are exactly the ones we want.
	_, name, certain := charset.DetermineEncoding(data, "")
	// When nothing is detected DetermineEncoding falls back to windows-1252
	// with certain == false; keep "utf-8" as the default in that case.
	if certain || (name != "" && name != "windows-1252") {
		return name
	}
	return fallback
}
// DecodeHTMLBody returns a reader that yields the HTML body decoded from the
// encoding named by `charset`.
//
// If `charset` is empty, DecodeHTMLBody tries to guess the encoding from the
// content. It returns a nil reader and the lookup error when `charset` is not
// a name known to htmlindex. When the resolved encoding is utf-8 the body is
// returned without a decoding layer.
func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error) {
	if charset == "" {
		// Wrap body once and keep using the wrapper: detection peeks through
		// bufio.NewReader, which returns this same reader instead of a
		// throwaway one, so the sniffed bytes stay buffered and are not lost
		// from the reader handed back to the caller.
		buffered := bufio.NewReader(body)
		charset = detectContentCharset(buffered)
		body = buffered
	}

	e, err := htmlindex.Get(charset)
	if err != nil {
		return nil, err
	}

	// If the canonical name cannot be resolved, name is "" and the decoder is
	// applied anyway.
	if name, _ := htmlindex.Name(e); name != "utf-8" {
		body = e.NewDecoder().Reader(body)
	}
	return body, nil
}
试了一下,如果 charset 传空字符串让它自己猜,它也猜不出来。
// Decode the response body with the charset given explicitly as "gb2312",
// then let goquery parse the document from the decoding reader.
// NOTE(review): both error results are discarded with `_`. DecodeHTMLBody
// returns a nil reader on error, which would then be passed straight to
// goquery — check the errors in real code.
docbody, _ := DecodeHTMLBody(resp.Body,"gb2312")
doc, _ := goquery.NewDocumentFromReader(docbody)
这样可以