goquery 处理中文网页
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

爬虫抓取的网页在 meta 标签里声明的是 gb2312 字符集,goquery 直接解析拿到的是乱码。

刚开始用的是下面这个函数:


// DecodeToGBK2312 converts text from the GB2312/GBK byte encoding to UTF-8.
//
// Pages that declare charset=gb2312 are served in the EUC-CN byte encoding,
// which the GBK decoder handles. simplifiedchinese.HZGB2312 is the unrelated
// 7-bit HZ escape encoding, and decoding such pages with it yields garbled
// text. The decoder's String method also sizes its output buffer itself, so
// the result is never truncated by a too-small destination slice.
//
// If decoding fails, the input is returned unchanged.
func DecodeToGBK2312(text string) string {
	decoded, err := simplifiedchinese.GBK.NewDecoder().String(text)
	if err != nil {
		return text
	}
	return decoded
}

还是乱码。。。。

后来在wiki里找了一种方法

// detectContentCharset sniffs the character encoding of an HTML document from
// its first bytes (at most 1024) and returns the encoding name. It returns
// "utf-8" when nothing could be read or no name was determined.
//
// Peeking goes through a bufio.Reader. When body is already a *bufio.Reader
// with at least the default buffer size, bufio.NewReader returns that same
// reader, so the peeked bytes remain readable by the caller. For any other
// reader, the bytes pulled into the temporary buffer are consumed from body.
func detectContentCharset(body io.Reader) string {
	r := bufio.NewReader(body)
	// Peek returns the bytes it managed to read together with an error when
	// the document is shorter than 1024 bytes, so the error alone must not
	// cause the data to be thrown away.
	data, _ := r.Peek(1024)
	if len(data) == 0 {
		return "utf-8"
	}
	// The third result reports whether the guess is certain, not whether it
	// succeeded. A charset found in a <meta> tag is reported as uncertain but
	// is still the best available answer, so the flag is ignored here.
	if _, name, _ := charset.DetermineEncoding(data, ""); name != "" {
		return name
	}
	return "utf-8"
}

// DecodeHTMLBody returns a decoding reader of the html Body for the specified `charset`.
// If `charset` is empty, DecodeHTMLBody tries to guess the encoding from the content.
// An error is returned when `charset` is not a name known to htmlindex.
func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error) {
	if charset == "" {
		// Sniff through a buffered reader and keep using that same reader
		// afterwards; otherwise the bytes buffered during detection would be
		// missing from the stream handed back to the caller.
		buffered := bufio.NewReader(body)
		charset = detectContentCharset(buffered)
		body = buffered
	}
	e, err := htmlindex.Get(charset)
	if err != nil {
		return nil, err
	}
	// UTF-8 input needs no transcoding; every other encoding is wrapped in a
	// decoder that yields UTF-8.
	if name, _ := htmlindex.Name(e); name != "utf-8" {
		body = e.NewDecoder().Reader(body)
	}
	return body, nil
}

试了下如果传空字符串过去让他猜他也猜不出来

        // Wrap the response body in a gb2312 decoder, then let goquery parse the decoded stream.
        // NOTE(review): both errors are discarded; DecodeHTMLBody returns a nil
        // reader on failure, so real code should check them before parsing.
        docbody, _ := DecodeHTMLBody(resp.Body,"gb2312")
	doc, _ := goquery.NewDocumentFromReader(docbody)

这样可以