golang 实现一个完整的爬虫项目

golang 实现一个完整的爬虫项目,第1张

众所周知,python 因大数据而成名,常借助爬虫框架和 pytorch 等工具获取并处理目标数据。golang 除了做 web 服务之外,也很适合写爬虫项目。goquery 是一个用 golang 实现的、类似 jQuery 的 HTML 解析库,可以方便地解析 web 页面。本爬虫项目是 2 年前的一个试验性项目,最近准备写点东西,在网上积累点人气,因此分享出来。此文章内容仅用于技术交流,读者不得将本文技术用于其它任何目的,否则所有后果全部自负。

工程截图:

部分代码: 机器验证码鉴定接口
// chaojiying.go
package ying

import (
	"bufio"
	"crypto/tls"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"pdf/misc"
	"strings"
	"time"
)

// Built-in defaults for the captcha-service account and the HTTP proxy.
// init copies YUSER, YPASSWD and IPPROXY into Proxycfg whenever CFG_FILENAME
// is missing, unreadable or not valid JSON.
// NOTE(review): the account name and password are hard-coded in source;
// consider keeping them only in the configuration file.
const (
	CFG_FILENAME = "proxy_cfg.json"
	YUSER        = "pony"
	YPASSWD      = "1234546"
	IPPROXY      = "http://127.0.0.1:10100"
)

// ResCjy is the JSON reply returned by the Chaojiying captcha service.
type ResCjy struct {
	ERRNO  int    `json:"ERR_NO"`
	ERRSTR string `json:"ERR_STR"`
	PICID  string `json:"PIC_ID"`
	PICSTR string `json:"PIC_STR"` // recognized captcha text
	MD5    string `json:"MD5"`
}
// ProxyCfg is the JSON layout of the configuration file: the captcha-service
// account (User, Passwd) and the proxy URL (IPProxy).
type ProxyCfg struct {
	User    string `json:"user"`
	Passwd  string `json:"passwd"`
	IPProxy string `json:"ipproxy"`
}

// Proxycfg is the package-wide configuration instance. It starts empty and is
// filled in by this package's init function.
var Proxycfg = &ProxyCfg{}

// init loads Proxycfg from CFG_FILENAME. When the file exists, can be read
// and decodes as JSON, the loaded value is printed and kept. In every other
// case the compiled-in defaults are written into Proxycfg.
func init() {
	loaded := false
	if misc.Exists(CFG_FILENAME) {
		if raw, readErr := ioutil.ReadFile(CFG_FILENAME); readErr == nil {
			loaded = json.Unmarshal(raw, Proxycfg) == nil
		}
	}

	if loaded {
		fmt.Printf("%+v", Proxycfg)
		return
	}

	// Fall back to the defaults (this also overwrites any partially decoded fields).
	Proxycfg.IPProxy = IPPROXY
	Proxycfg.User = YUSER
	Proxycfg.Passwd = YPASSWD
}

// Chaojiying bundles the HTTP client used for crawling together with the
// settings it was built from.
type Chaojiying struct {
	Timeout    time.Duration // set by NewChaojiying; not read by the code shown in this file
	HttpsProxy string        // proxy URL applied by InitWithOptions; empty means no proxy
	HttpClient *http.Client  // created by InitWithOptions
}

// NewChaojiying returns a Chaojiying whose proxy is taken from
// Proxycfg.IPProxy and whose HTTP client has already been created by
// InitWithOptions.
func NewChaojiying() *Chaojiying {
	c := &Chaojiying{
		Timeout:    1000,
		HttpsProxy: Proxycfg.IPProxy,
	}
	c.InitWithOptions()
	return c
}

// InitWithOptions creates client.HttpClient.
//
// The transport skips TLS certificate verification, disables transparent
// compression and disables HTTP/2 (by installing an empty TLSNextProto map).
// When client.HttpsProxy is non-empty and parses as a URL, requests are routed
// through it; a parse error is only logged and no proxy is installed. The
// client's overall timeout is ten minutes.
func (client *Chaojiying) InitWithOptions() {
	transport := &http.Transport{
		DisableCompression: true,
		TLSClientConfig:    &tls.Config{InsecureSkipVerify: true},
		// An empty, non-nil map switches HTTP/2 off.
		TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
	}

	if proxy := client.HttpsProxy; proxy != "" {
		if parsed, err := url.Parse(proxy); err == nil {
			transport.Proxy = http.ProxyURL(parsed)
		} else {
			log.Println(err)
		}
	}

	client.HttpClient = &http.Client{
		Transport: transport,
		Timeout:   10 * time.Minute,
	}
}

// GetScore posts the account credentials to the Chaojiying GetScore endpoint
// and returns the raw response body.
//
// The request is sent with a fresh http.Client, not client.HttpClient, so the
// proxy configured by InitWithOptions is not used here. The function has no
// error return: any failure terminates the process through log.Fatal.
func (client *Chaojiying) GetScore(user string, pass string) []byte {
	const endpoint = "https://upload.chaojiying.net/Upload/GetScore.php"

	parameters := url.Values{}
	parameters.Add("user", user)
	parameters.Add("pass", pass)

	req, err := http.NewRequest("POST", endpoint, strings.NewReader(parameters.Encode()))
	if err != nil {
		log.Fatal(err)
	}

	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)")
	req.Header.Set("Connection", "Keep-Alive")
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")

	c := &http.Client{}
	resp, err := c.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	// Release the connection once the body has been read; the original leaked it.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return body
}

// getEncodedBase64 reads the file at filename and returns its content encoded
// with standard base64.
//
// If the file cannot be opened or read, the error is logged and the empty
// string is returned. The file handle is always closed.
func getEncodedBase64(filename string) string {
	f, err := os.Open(filename)
	if err != nil {
		log.Println(err)
		return ""
	}
	defer f.Close()

	content, err := ioutil.ReadAll(bufio.NewReader(f))
	if err != nil {
		log.Println(err)
		return ""
	}
	return base64.StdEncoding.EncodeToString(content)
}

// GetPicVal uploads the image stored at filename to the Chaojiying recognition
// endpoint and returns the raw JSON reply.
//
// user and pass are the account credentials; softid, codetype and len_min are
// forwarded unchanged as form fields (codetype values are listed at
// http://www.chaojiying.com/price.html). The image travels base64-encoded in
// the "file_base64" field. A fresh http.Client is used, not client.HttpClient.
// Any failure terminates the process through log.Fatal.
func (client *Chaojiying) GetPicVal(user string, pass string, softid string, codetype string,
	len_min string, filename string) []byte {
	const endpoint = "http://upload.chaojiying.net/Upload/Processing.php"

	form := url.Values{
		"user":        {user},
		"pass":        {pass},
		"softid":      {softid},
		"codetype":    {codetype},
		"len_min":     {len_min},
		"file_base64": {getEncodedBase64(filename)},
	}

	req, err := http.NewRequest("POST", endpoint, strings.NewReader(form.Encode()))
	if err != nil {
		log.Fatal(err)
	}
	for name, value := range map[string]string{
		"Content-Type": "application/x-www-form-urlencoded",
		"User-Agent":   "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)",
		"Connection":   "Keep-Alive",
	} {
		req.Header.Set(name, value)
	}

	httpClient := &http.Client{}
	resp, err := httpClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	payload, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return payload
}

搜索范围:
package country
// country.go

// Country maps a county name to an id string; init replaces it with the
// built-in table and QuecyCountry reads it.
var Country = make(map[string]string)

// OfficeId is an id/name pair, decodable from JSON.
type OfficeId struct {
	ID   string `json:"id"`
	Name string `json:"name"`
}

// Office maps a string key to an OfficeId. Nothing in this file populates it.
var Office = make(map[string]*OfficeId)

// Crasy is a JSON-decodable list of option entries.
// NOTE(review): the field set looks like a serialized drop-down list returned
// by the target site — confirm against the actual payload.
type Crasy []struct {
	Disabled bool        `json:"Disabled"`
	Group    interface{} `json:"Group"`
	Selected bool        `json:"Selected"`
	Text     string      `json:"Text"`
	Value    string      `json:"Value"`
}

// AgencyNameMap maps an agency name to an id string; init replaces it with the
// built-in table and QueryLawEnforcementAgency reads it.
var AgencyNameMap = make(map[string]string)

// CCT holds the search criteria of one crawl.
type CCT struct {
	CrashStartDate string `json:"crashstartdate"` // search start date
	CrashEndDate   string `json:"crashenddate"`   // search end date
	County         string `json:"county"`
	RegionID       string `json:"countyid"`
	AgencyName     string `json:"agencyname"`
	Forcement      string `json:"forcement"`
}

// init installs the built-in lookup tables into Country and AgencyNameMap,
// replacing the empty maps they were declared with.
func init() {
	countyIDs := map[string]string{
		"uu County":  "1",
		"Ala County": "2",
	}
	agencyIDs := map[string]string{
		"CADIRCLEVILLEEOPJM DEPARadsTMENT - 0650199": "3667",
		"NOADSSARTdsaHWEFIELD - 070300":           "3235",
	}

	Country = countyIDs
	AgencyNameMap = agencyIDs
}

// QuecyCountry looks country up in the Country table. It returns the mapped
// id and true when the key is present, or "" and false otherwise.
func QuecyCountry(country string) (id string, found bool) {
	id, found = Country[country]
	return id, found
}

// QueryLawEnforcementAgency looks AgencyName up in AgencyNameMap. It returns
// the mapped id and true when the key is present, or "" and false otherwise.
func QueryLawEnforcementAgency(AgencyName string) (id string, found bool) {
	id, found = AgencyNameMap[AgencyName]
	return id, found
}

资源爬取:
//auto.go
package homepage

import (
	"bufio"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/http/httputil"
	"net/url"
	"os"
	"path/filepath"
	"pdf/country"
	"pdf/metadata"
	"pdf/misc"
	"pdf/ying"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	uuid "github.com/satori/go.uuid"
)

// Debug, when true, makes Getpdflist and DownloadPdf dump their outgoing HTTP
// request to standard output.
var Debug bool = false

const (
	// SESSIONID_FILE is the JSON file from which init tries to load idtoken.
	SESSIONID_FILE = "sessionid.json"
)

const (
	// PDFDIR is the directory, below the working directory, under which
	// downloaded PDFs are written.
	PDFDIR = "pdfs"
)

// gCurCookies holds the cookies read from gCurCookieJar by the latest
// FetchHomepage call.
var gCurCookies []*http.Cookie

// gCurCookieJar is the cookie jar that FetchHomepage installs on the HTTP client.
var gCurCookieJar *cookiejar.Jar

func init() {
	gCurCookies = nil
	gCurCookieJar, _ = cookiejar.New(nil)
}

// PdfAttr describes one PDF entry scraped from a result listing.
type PdfAttr struct {
	ID                 string `json:"id"`
	Token              string `json:"token"`
	Name               string `json:"name"`
	DateReport         string `json:"crashdatereport"`    // crash date for report
	CrashAddDateReport string `json:"crashadddatereport"` // crash add date for report
	NextPageToken      string `json:"nextpagetoken"`
}

// NextPageToken carries the form token and page number sent by NextPage.
type NextPageToken struct {
	Token   string `json:"token"`
	Pagenum int    `json:"pagenum"`
}

// IdToken holds the session identifiers and tokens of the current session.
type IdToken struct {
	FormReqToken   string `json:"formreqtoken"`   // token taken from the HTML form
	SessionID      string `json:"sessionid"`      // session id sent in the request header
	HeaderReqToken string `json:"headerreqtoken"` // token sent in the request header
	Expire         string `json:"expire"`         // expiry time
	FlagNewSession bool   // true on the first visit to the home page
	ImageBase64    string // base64 text of the captcha image
}

// HREQ carries a captcha image as base64 text in JSON or XML form.
type HREQ struct {
	B64Image string `json:"captchaImage" xml:"captchaImage"`
}

// idtoken is the package-wide session state shared by init and get_resource.
var idtoken = &IdToken{}

// init tries to restore idtoken from SESSIONID_FILE.
//
// FlagNewSession is documented as "true on the first visit to the home page",
// so it is cleared only when the file exists, decodes as JSON and contains a
// session id plus both tokens. In every other case it is set to true. The
// original set the flag to true on every path, which made the restore branch
// a no-op.
func init() {
	restored := false
	if misc.Exists(SESSIONID_FILE) {
		d0, err := ioutil.ReadFile(SESSIONID_FILE)
		if err == nil {
			err = json.Unmarshal(d0, idtoken)
			restored = err == nil &&
				idtoken.SessionID != "" &&
				idtoken.FormReqToken != "" &&
				idtoken.HeaderReqToken != ""
		}
	}

	// Assign explicitly: the JSON file could itself carry a FlagNewSession value.
	idtoken.FlagNewSession = !restored
}

// ShowCookies prints the number of cached cookies followed by each cookie in
// %+v form.
func ShowCookies() {
	fmt.Printf("cookieNum=%d\n", len(gCurCookies))
	for _, ck := range gCurCookies {
		fmt.Printf("%+v", ck)
	}
}

// Base64ToImage decodes base64str (standard base64 text) and writes the
// decoded bytes to filename, creating the file or replacing its content.
//
// base64str: the base64 text of the image.
// filename:  the image file to produce, e.g. "x.png".
//
// It panics when the text is not valid base64 or the file cannot be written.
// The original first round-tripped the text through a fixed-name temporary
// file and opened the target without truncation, so overwriting a longer file
// left stale trailing bytes.
func Base64ToImage(base64str []byte, filename string) {
	dist, err := base64.StdEncoding.DecodeString(string(base64str))
	if err != nil {
		panic(err)
	}

	// WriteFile creates or truncates the target and reports write errors.
	if err := ioutil.WriteFile(filename, dist, os.ModePerm); err != nil {
		panic(err)
	}
}

// FetchHomepage issues a browser-like GET for uri through cjy.HttpClient and
// returns the response body.
//
// As a side effect it installs the package cookie jar on the client and, after
// a successful read, stores the jar's cookies for the request URL in
// gCurCookies. All failures (building the request, sending it, reading the
// body) are returned to the caller; the original ignored the first and exited
// the process on the last.
//
// NOTE(review): net/http ignores a "Host" entry in Request.Header for client
// requests (Request.Host is used instead), and setting Accept-Encoding by hand
// disables transparent decompression — confirm the server replies uncompressed.
func FetchHomepage(uri string, cjy *ying.Chaojiying) ([]byte, error) {
	client := cjy.HttpClient
	client.Jar = gCurCookieJar

	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	req.Header.Add("Accept-Encoding", "gzip, deflate, br")
	req.Header.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
	req.Header.Add("Connection", "keep-alive")
	req.Header.Add("Host", "dps.akjd99i.com")
	req.Header.Add("Sec-Fetch-Dest", "document")
	req.Header.Add("Sec-Fetch-Mode", "navigate")
	req.Header.Add("Sec-Fetch-Site", "none")
	req.Header.Add("Sec-Fetch-User", "?1")
	req.Header.Add("Upgrade-Insecure-Requests", "1")
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	gCurCookies = gCurCookieJar.Cookies(req.URL)

	return body, nil
}

// 获取资源
func get_resource(usecjy bool, url string, cct *country.CCT) ([]byte, error) {
	/*
		doc, err := goquery.NewDocument(url)
		if err != nil {
			fmt.Println(err)
			return nil, err
		}
	*/
	cjy := ying.NewChaojiying()

	// 1, 获取首页,并解析
	text, err := FetchHomepage(url, cjy)
	if err != nil {
		return nil, err
	}
	// 获取表单token
	r := strings.NewReader(string(text))
	doc, err := goquery.NewDocumentFromReader(r)
	doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
		if token, exist := selection.Find("input").Attr("value"); exist {
			fmt.Println("<<<", token)
			idtoken.FormReqToken = token
		}
	})
	//ioutil.WriteFile("homepage.html", text, 0666)
	// 用string构建html文档
	/*
		r = strings.NewReader(string(text))
		doc, err = goquery.NewDocumentFromReader(r)
		//doc, err := goquery.NewDocumentFromReader(resp.Body)
		if err != nil {
			panic(err)
		}
	*/
	// 获取验证码base64字符串,写入文件
	imagefile := uuid.NewV4().String() + ".png"
	//fmt.Println(imagefile)
	doc.Find(".captchaImage").Each(func(i int, selection *goquery.Selection) {
		href, exist := selection.Attr("src")
		if exist {
			idtoken.ImageBase64 = href[len("data:image/png;base64,")+1:]

			Base64ToImage([]byte(idtoken.ImageBase64), imagefile)
		}
	})
	// 超级鹰验证码
	var rbody ying.ResCjy
	if usecjy {
		d1 := cjy.GetPicVal(ying.Proxycfg.User, ying.Proxycfg.Passwd, "912047", "1902", "6", imagefile)
		rstr := strings.ToUpper(string(d1))
		fmt.Println("从超级鹰获取到的验证码字符串为:", rstr)
		err = json.Unmarshal(d1, &rbody)
		if err != nil {
			return nil, err
		}
	}
	defer os.Remove(imagefile)

	// 打印cookies
	//ShowCookies()

	// sessionid token写入文件
	for _, v := range gCurCookies {
		if v.Name == "ASP.NET_SessionId" {
			idtoken.SessionID = v.Value
		}
		if v.Name == "__RequestVerificationToken" {
			fmt.Println("req token is:", v.Value)
			idtoken.HeaderReqToken = v.Value
		}
	}
	idtoken.FlagNewSession = false

	// 进入pdf资源列表首页
	reqpdf := &Reqpdf{}
	reqpdf.form__RequestVerificationToken = idtoken.FormReqToken
	reqpdf.req__RequestVerificationToken = idtoken.HeaderReqToken
	reqpdf.sessionid = idtoken.SessionID
	if usecjy {
		reqpdf.CaptchaAnswer = strings.ToUpper(string(rbody.PICSTR)) //rbody.PICSTR
		//fmt.Printf("%+v", reqpdf)
	} else {
		// 手工输入验证码
		counts := make(map[int]string)

		// 从标准输入流中接收输入数据
		input := bufio.NewScanner(os.Stdin)

		fmt.Printf("Please type in something:\n")
		// 逐行扫描
		i := 0
		for input.Scan() {
			line := input.Text()
			// 输入bye时 结束
			if line == "bye" {
				break
			}
			// 更新key对应的val 新key对应的val是默认0值
			counts[i] = line
			i++
		}

		if len(counts) != 1 {
			return nil, err
		}

		var0, ok := counts[0]
		if ok {
			// var0 图形验证码
			reqpdf.CaptchaAnswer = strings.ToUpper((var0)) //rbody.PICSTR
			fmt.Println(var0)
		}
	}

	// 获取pdf资源列表
	text, err = Getpdflist(reqpdf, cjy, cct)
	if err != nil {
		log.Println("Getpdflist() failed!")
		return nil, err
	}
	// 保存pdf首页html
	//ioutil.WriteFile("pdfpage1.html", text, 0666)
	// 用string构建html文档
	r = strings.NewReader(string(text))
	doc, err = goquery.NewDocumentFromReader(r)
	if err != nil {
		log.Println("mkae pdfpage1 doc failed!")
		return nil, err
	}

	// 获取当前页面(首页)pdf资源列表
	pdfarrs := []*PdfAttr{}
	doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) {
		pattr := &PdfAttr{}
		selection.Find("form").Each(func(i int, selection *goquery.Selection) {
			if id, exist := selection.Attr("id"); exist {
				pattr.ID = id
				fmt.Println(id)
			}
			if token, exist := selection.Find("input").Attr("value"); exist {
				pattr.Token = token
				fmt.Println(token)
			}
			if name, exist := selection.Find("button").Attr("name"); exist {
				pattr.Name = name
				fmt.Println(name)
			}
			//fmt.Println("-----------------\r\n")

		})

		// 获取文档日期
		s := selection.Text()
		ss := strings.Fields(s)
		//fmt.Println(s)
		//fmt.Println(ss)
		pattr.DateReport = ss[1]
		pattr.CrashAddDateReport = ss[2]

		pdfarrs = append(pdfarrs, pattr)
	})

	// 获取翻页时form的token
	nextpt := &NextPageToken{}
	doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
		if token, exist := selection.Find("input").Attr("value"); exist {
			fmt.Println(token)
			nextpt.Token = token
			nextpt.Pagenum = 2
		}
	})

	// 打印首页pdf资源列表
	for _, v := range pdfarrs {
		fmt.Printf("%+v\n", v)
	}

	for {
		// 下载当前页面pdf
		for _, v := range pdfarrs {
			d4, err := DownloadPdf(reqpdf, cjy, v)
			if err != nil {
				log.Println("download pdf failed!")
				return nil, err
			}

			// 写入pdf文件
			pwddir, _ := os.Getwd()
			path := filepath.Join(pwddir, PDFDIR, cct.County, cct.AgencyName)
			if exist := misc.Exists(path); !exist {
				os.MkdirAll(path, 0666)
			}
			ioutil.WriteFile(filepath.Join(path, v.Name+".pdf"), d4, 0666)

			// 写入pdf文件元数据到mysql数据库
			pdfmeta := &metadata.PdfItem{
				County:             cct.County,
				RegionID:           cct.RegionID,
				AgencyName:         cct.AgencyName,
				Forcement:          cct.Forcement,
				DateReport:         v.DateReport,
				CrashAddDateReport: v.CrashAddDateReport,
				Path:               path,
				Filename:           v.Name + ".pdf",
				ID:                 v.ID,
			}
			pdfmeta.Write(nil)
		}
		// 当前页面pdf,不足10个pdf,则搜索完毕,返回
		if len(pdfarrs) < 10 {
			fmt.Println("没有搜到下一页,退出. 最后一页的资源数量是:", len(pdfarrs))
			break
		}
		pdfarrs = pdfarrs[:0]

		// 翻页
		d5, err := NextPage(reqpdf, cjy, nil, nextpt, cct)
		if err != nil {
			log.Println("next page failed!")
			return nil, err
		}
		//ioutil.WriteFile("nextpage.html", d5, 0666)
		r = strings.NewReader(string(d5))
		doc, err = goquery.NewDocumentFromReader(r)
		if err != nil {
			log.Panicln("new nextpage.html doc failed!")
			return nil, err
		}
		// 获取文档id 日期
		doc.Find(".selectable").Each(func(i int, selection *goquery.Selection) {
			pattr := &PdfAttr{}
			selection.Find("form").Each(func(i int, selection *goquery.Selection) {
				if id, exist := selection.Attr("id"); exist {
					pattr.ID = id
					fmt.Println(id)
				}
				if token, exist := selection.Find("input").Attr("value"); exist {
					pattr.Token = token
					fmt.Println(token)
				}
				if name, exist := selection.Find("button").Attr("name"); exist {
					pattr.Name = name
					fmt.Println(name)
				}
				//fmt.Println("-----------------\r\n")

			})

			// 获取文档日期
			s := selection.Text()
			ss := strings.Fields(s)
			//fmt.Println(s)
			//fmt.Println(ss)
			pattr.DateReport = ss[1]
			pattr.CrashAddDateReport = ss[2]

			pdfarrs = append(pdfarrs, pattr)
		})

		doc.Find(".form-horizontal").Each(func(i int, selection *goquery.Selection) {
			if token, exist := selection.Find("input").Attr("value"); exist {
				fmt.Println(token)
				nextpt.Token = token
				nextpt.Pagenum++
				fmt.Println("下一页编号是:", nextpt.Pagenum)
			}
		})
	}

	for {
		time.Sleep(time.Second * 3)
		fmt.Println("sleep 3s")
		break
	}

	return nil, nil
}

// Reqpdf carries the per-session values needed by Getpdflist, DownloadPdf and
// NextPage.
type Reqpdf struct {
	form__RequestVerificationToken string // token sent in the PDF request form
	req__RequestVerificationToken  string // token sent in the PDF request's Cookie header
	sessionid                      string // session id sent in the PDF request's Cookie header
	CaptchaAnswer                  string // captcha answer text
}

// Getpdflist submits the search form, including the captcha answer, and
// returns the HTML of the first result page.
//
// The form is built from reqpdf (form token, captcha answer) and cct (date
// range, county id, agency). The session id and request token travel in a
// hand-built Cookie header. When Debug is true the request is dumped to
// standard output. All failures are returned to the caller; the response body
// is always closed (the original leaked it and exited the process on a read
// error).
func Getpdflist(reqpdf *Reqpdf, cjy *ying.Chaojiying, cct *country.CCT) ([]byte, error) {
	// Form data.
	data := url.Values{}
	data.Set("__RequestVerificationToken", reqpdf.form__RequestVerificationToken)
	data.Set("Parameters.LocalReportNumber", "")
	data.Set("Parameters.DocumentNumber", "")
	data.Set("Parameters.CrashStartDate", cct.CrashStartDate)
	data.Set("Parameters.CrashEndDate", cct.CrashEndDate)
	data.Set("Parameters.County", cct.RegionID)
	data.Set("Parameters.Forcement", cct.Forcement)
	data.Set("Parameters.AgencyName", cct.AgencyName)
	data.Set("Data.Count", "0")
	data.Set("TempDataMessage", "")
	data.Set("Parameters.CurrentPage", "1")
	data.Set("Parameters.SortField", "CrashDateTime")
	data.Set("Parameters.SortDirection", "Descending")
	data.Set("Parameters.OnSearch", "true")
	data.Set("NoDataFound", "")
	// Add (not Set): a second, empty CrashEndDate value is sent as well.
	data.Add("Parameters.CrashEndDate", "")
	data.Set("Parameters.LastName", "")
	data.Set("Parameters.Email", "")
	data.Set("Parameters.CaptchaAnswer", reqpdf.CaptchaAnswer)
	data.Set("btnSearch", "Search")
	payload := data.Encode()

	// Request headers.
	URI := "https://jialulue/huoxinghao/req"
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cache-Control", "max-age=0")
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Host", "dps.akjd99i.com")
	r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin")
	r.Header.Add("Sec-Fetch-Dest", "document")
	r.Header.Add("Sec-Fetch-Mode", "navigate")
	r.Header.Add("Sec-Fetch-Site", "same-origin")
	r.Header.Add("Sec-Fetch-User", "?1")
	r.Header.Add("Upgrade-Insecure-Requests", "1")
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36")

	// Dump the HTTP request when debugging.
	if Debug {
		fmt.Println("------------------------start--------------------------------")
		requestDump, err := httputil.DumpRequest(r, true)
		if err != nil {
			fmt.Println(err)
		}
		fmt.Println(string(requestDump))
		fmt.Println("-------------------------end---------------------------------")
	}

	// Send the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

// DownloadPdf requests the document described by pdfattr and returns the raw
// response body.
//
// The form carries the entry's token and id plus an empty field named after
// the entry's button. The session id and request token from reqpdf travel in a
// hand-built Cookie header. When Debug is true the request is dumped to
// standard output. All failures are returned to the caller (the original
// ignored a request-construction error and exited the process on a read
// error).
// NOTE(review): the HTTP status is not checked, so an error page would be
// returned as if it were PDF data.
func DownloadPdf(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr) ([]byte, error) {
	// Form data.
	data := url.Values{}
	data.Set("__RequestVerificationToken", pdfattr.Token)
	data.Set("id", pdfattr.ID)
	data.Set(pdfattr.Name, "")
	payload := data.Encode()

	// Request headers.
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	URI := "https://dps.pdfs.com/guize/Reports"

	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36")

	// Dump the HTTP request when debugging.
	if Debug {
		fmt.Println("------------------------start--------------------------------")
		requestDump, err := httputil.DumpRequest(r, true)
		if err != nil {
			fmt.Println(err)
		}
		fmt.Println(string(requestDump))
		fmt.Println("-------------------------end---------------------------------")
	}

	// Send the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

// NextPage requests result page nextpt.Pagenum of the current search and
// returns its HTML.
//
// The form repeats the search criteria from cct together with the paging token
// in nextpt. The session id and request token from reqpdf travel in a
// hand-built Cookie header. pdfattr is not used. The outgoing request is
// always dumped to standard output (the Debug gate is disabled here, as in the
// original). All failures are returned to the caller and the response body is
// always closed.
//
// Fix relative to the original: the User-Agent value no longer starts with a
// duplicated "User-Agent: " prefix.
func NextPage(reqpdf *Reqpdf, cjy *ying.Chaojiying, pdfattr *PdfAttr, nextpt *NextPageToken, cct *country.CCT) ([]byte, error) {
	// Form data.
	data := url.Values{}
	data.Set("__RequestVerificationToken", nextpt.Token)
	data.Set("Parameters.AgencyName", cct.AgencyName)
	data.Set("Data.Count", "10")
	data.Set("TempDataMessage", "")
	data.Set("Parameters.LocalReportNumber", "")
	data.Set("Parameters.DocumentNumber", "")
	data.Set("Parameters.CrashStartDate", cct.CrashStartDate)
	data.Set("Parameters.County", cct.RegionID)
	data.Set("Parameters.Forcement", cct.Forcement)
	data.Set("Parameters.LastName", "")
	data.Set("Parameters.Email", "")
	data.Set("Parameters.CurrentPage", strconv.Itoa(nextpt.Pagenum))
	data.Set("Parameters.SortField", "CrashDateTime")
	data.Set("Parameters.SortDirection", "Descending")
	data.Set("Parameters.OnSearch", "false")
	data.Set("Parameters.CrashEndDate", cct.CrashEndDate)
	payload := data.Encode()

	// Request headers.
	cookie := "ASP.NET_SessionId=" + reqpdf.sessionid + "; __RequestVerificationToken=" + reqpdf.req__RequestVerificationToken
	URI := "https://jialulue/huoxinghao/req"

	r, err := http.NewRequest("POST", URI, strings.NewReader(payload)) // URL-encoded payload
	if err != nil {
		return nil, err
	}
	r.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	r.Header.Add("Accept-Encoding", "gzip, deflate, br")
	r.Header.Add("Accept-Language", "zh-CN,zh;q=0.9")
	r.Header.Add("Cache-Control", "max-age=0")
	r.Header.Add("Connection", "keep-alive")
	r.Header.Add("Content-Length", strconv.Itoa(len(payload)))
	r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	r.Header.Add("Cookie", cookie)
	r.Header.Add("Host", "dps.akjd99i.com")
	r.Header.Add("Origin", "https://dps.akjd99i.com")
	r.Header.Add("Referer", "...........")
	r.Header.Add("Sec-Fetch-Dest", "document")
	r.Header.Add("Sec-Fetch-Mode", "navigate")
	r.Header.Add("Sec-Fetch-Site", "same-origin")
	r.Header.Add("Sec-Fetch-User", "?1")
	r.Header.Add("Upgrade-Insecure-Requests", "1")
	r.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko)")
	r.Header.Add("Referrer-Policy", "strict-origin-when-cross-origin")

	// Dump the HTTP request (unconditionally).
	fmt.Println("------------------------ 翻页start--------------------------------")
	requestDump, err := httputil.DumpRequest(r, true)
	if err != nil {
		fmt.Println(err)
	}
	fmt.Println(string(requestDump))
	fmt.Println("------------------------- 翻页 end---------------------------------")

	// Send the request.
	client := cjy.HttpClient
	resp, err := client.Do(r)
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}

// GetHome is the exported entry point of a crawl: it delegates to get_resource
// with the same arguments and reports only the resulting error.
func GetHome(usecjy bool, url string, cct *country.CCT) error {
	if _, err := get_resource(usecjy, url, cct); err != nil {
		return err
	}
	return nil
}

资源元数据存储到mysql:
// pdb.go
package metadata

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"pdf/misc"
	"sync"

	_ "github.com/go-sql-driver/mysql"
	"github.com/jinzhu/gorm"
)

// Default MySQL connection settings; init_ uses them as the starting values of
// its Dbcfg before looking at db_cfg.json.
// NOTE(review): the database password is hard-coded in source.
const (
	USER     = "root"
	PASSWD   = "123456"
	IP       = "127.0.0.1"
	PORT     = "3306"
	DATABASE = "pdf"
)

// Dbcfg is the JSON layout of the database configuration file.
type Dbcfg struct {
	User     string `json:"user"`
	Passwd   string `json:"passwd"`
	IP       string `json:"ip"`
	PORT     string `json:"port"`
	Database string `json:"database"`
}

// Dydb is the storage interface implemented by *PdfItem.
type Dydb interface {
	Write(obj interface{}) error

	Read(obj interface{}) error
}

// PdfItem is the metadata row stored for one downloaded PDF.
type PdfItem struct {
	//Mobile   string `json:"mobile" gorm:"primary_key"` // Douyin Jingling account (disabled field)
	County             string `json:"county"`
	RegionID           string `json:"countyid"`
	AgencyName         string `json:"agencyname"`
	Forcement          string `json:"lawenforcementagency"`
	DateReport         string `json:"crashdatereport"`
	CrashAddDateReport string `json:"crashadddatereport"`
	Path               string `json:"path"`                  // directory holding the document
	Filename           string `json:"filename"`              // document file name
	ID                 string `json:"id" gorm:"primary_key"` // document id (primary key)
}

// db is the shared gorm handle; it stays nil until init_ succeeds.
var db *gorm.DB

// once guards the single initialization performed by Newdb.
var once sync.Once

// init_ prepares the metadata database and stores the gorm handle in db.
//
// It starts from the compiled-in defaults and, when db_cfg.json exists and
// decodes as JSON, uses that file's settings instead. An unreadable or invalid
// file is reported and ignored as a whole, so no half-applied configuration is
// used. It then creates the database if needed, opens it through gorm and
// migrates the PdfItem table with ENGINE=InnoDB.
//
// Every failure is returned as an error; the original panicked when gorm.Open
// failed and ignored migration errors.
func init_() error {
	dbcfg := &Dbcfg{
		User:     USER,
		Passwd:   PASSWD,
		IP:       IP,
		PORT:     PORT,
		Database: DATABASE,
	}
	if misc.Exists("db_cfg.json") {
		d0, err := ioutil.ReadFile("db_cfg.json")
		if err == nil {
			// Decode into a copy so a failed parse cannot leave partial values.
			loaded := *dbcfg
			if err = json.Unmarshal(d0, &loaded); err == nil {
				*dbcfg = loaded
			}
		}
		if err != nil {
			fmt.Println("ignoring db_cfg.json:", err.Error())
		}
	}

	// DSN prefix shared by both connections; only the schema name differs.
	dsnPrefix := dbcfg.User + ":" + dbcfg.Passwd + "@tcp(" + dbcfg.IP + ":" + dbcfg.PORT + ")/"
	const dsnOptions = "?charset=utf8&parseTime=True&loc=Local"

	// Create the database through the always-present "mysql" schema.
	sqldb, err := sql.Open("mysql", dsnPrefix+"mysql"+dsnOptions)
	if err != nil {
		fmt.Println("failed to open database:", err.Error())
		return err
	}
	defer sqldb.Close()
	_, err = sqldb.Exec("CREATE DATABASE IF NOT EXISTS " + dbcfg.Database + ";")
	if err != nil {
		fmt.Println("failed to create databases", err.Error())
		return err
	}

	// Open the target database.
	dbb, err := gorm.Open("mysql", dsnPrefix+dbcfg.Database+dsnOptions)
	if err != nil {
		fmt.Println("open db failed")
		return err
	}

	// Create or update the table, with the engine given as a table option.
	if err := dbb.Set("gorm:table_options", "ENGINE=InnoDB").AutoMigrate(&PdfItem{}).Error; err != nil {
		dbb.Close()
		return err
	}
	db = dbb

	return nil
}

// end_ closes the shared database handle if one was opened.
func end_() {
	if db == nil {
		return
	}
	db.Close()
}

// newdbErr remembers the outcome of the one-time initialization so that every
// call to Newdb reports it.
var newdbErr error

// Newdb initializes the metadata database exactly once and returns the result
// of that initialization.
//
// The original kept the error in a local variable, so after a failed first
// call every later call returned nil although db was still unset.
func Newdb() error {
	once.Do(func() {
		newdbErr = init_()
	})

	return newdbErr
}

// Write prints d and inserts it as a new row through the shared gorm handle.
// obj is not used.
//
// It returns an error when the database has not been initialized; the original
// dereferenced the nil handle. The row is passed to gorm by pointer, as gorm's
// Create expects, rather than as a non-addressable copy.
func (d *PdfItem) Write(obj interface{}) error {
	fmt.Printf("%+v", d)
	if db == nil {
		return fmt.Errorf("metadata: database is not initialized, call Newdb first")
	}
	if err := db.Create(d).Error; err != nil {
		return err
	}

	return nil
}

// Read is a placeholder that satisfies the Dydb interface. It performs no
// query and always returns nil; obj is not used.
func (d *PdfItem) Read(obj interface{}) error {
	//if err := db.Find(d, "mobile=? and password=?", d.Mobile, d.Password).Error; err != nil {
	//	return err
	//}

	return nil
}

文件判断:
//misc.go

package misc

import "os"

// Exists reports whether the file or directory at path exists. A successful
// os.Stat means it does; on a Stat error the result is whatever os.IsExist
// says about that error.
func Exists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return os.IsExist(statErr)
	}
	return true
}

// IsDir reports whether path names a directory. Any os.Stat error, including
// a missing path, yields false.
func IsDir(path string) bool {
	if info, statErr := os.Stat(path); statErr == nil {
		return info.IsDir()
	}
	return false
}

// IsFile reports whether path names an existing entry that is not a directory.
//
// The original returned !IsDir(path), which is also true for paths that do not
// exist at all; a missing or unreadable path now yields false.
func IsFile(path string) bool {
	s, err := os.Stat(path)
	if err != nil {
		return false
	}
	return !s.IsDir()
}

主程序:
// main.go

package main

import (
	"encoding/base64"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"pdf/country"
	"pdf/homepage"
	"pdf/metadata"
	"pdf/ying"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// parsehtml is an unimplemented placeholder for extracting the captcha's
// base64 text from HTML; it ignores its argument and returns nil.
func parsehtml(b64image string) error {
	return nil
}

// yanzhengma is an unimplemented placeholder for solving a captcha from a
// base64 image; it ignores its argument and returns "" and nil.
func yanzhengma(b64image string) (string, error) {
	return "", nil
}

// enterpdf is an unimplemented placeholder; it ignores its arguments and
// returns nil, nil.
func enterpdf(start, end, country, agency string) ([]byte, error) {
	return nil, nil
}

// fanye is an unimplemented placeholder for turning to the next page; it
// returns nil, nil.
func fanye() ([]byte, error) {

	return nil, nil
}

// imagesToBase64 reads the whole file at strImages and returns its content as
// standard base64 text.
//
// If the file cannot be opened or read, the error is logged and an empty slice
// is returned. The original issued a single Read into a 500000-byte buffer, so
// larger files (and short reads) were silently truncated.
func imagesToBase64(strImages string) []byte {
	ff, err := os.Open(strImages)
	if err != nil {
		log.Println(err)
		return []byte{}
	}
	defer ff.Close()

	content, err := ioutil.ReadAll(ff)
	if err != nil {
		log.Println(err)
		return []byte{}
	}
	return []byte(base64.StdEncoding.EncodeToString(content))
}

// base64ToImage decodes base64str (standard base64 text) and writes the
// decoded bytes to filename, creating the file or replacing its content.
//
// base64str: the base64 text of the image.
// filename:  the image file to produce, e.g. "x.png".
//
// It panics when the text is not valid base64 or the file cannot be written.
// The original round-tripped the text through "a.png.txt" (never removed) and
// opened the target without truncation, so overwriting a longer file left
// stale trailing bytes.
func base64ToImage(base64str []byte, filename string) {
	dist, err := base64.StdEncoding.DecodeString(string(base64str))
	if err != nil {
		panic(err)
	}

	// WriteFile creates or truncates the target and reports write errors.
	if err := ioutil.WriteFile(filename, dist, os.ModePerm); err != nil {
		panic(err)
	}
}

// main drives one crawl.
//
// Usage:
//
//	./pdf.exe -AgencyName="golang --opt" -county="sz" -start="12/01/2020" -end="12/28/2020" -usecjy=false
//
// Flow: stop if the built-in expiry date has passed; initialize the metadata
// database; read the flags; resolve county and agency through the country
// package; then loop: pick a proxy, print the Chaojiying balance and run the
// crawl, until a crawl finishes without error.
//
// Fixes relative to the original: the default -start/-end values use
// MM/DD/YYYY, matching the usage line above (they were DD/MM/YYYY); a failed
// database initialization stops the program instead of being ignored; the
// goto-based retry is a plain continue.
func main() {
	// Built-in expiry check: the program only runs before this instant.
	time1 := "2021-02-10 11:50:29"
	t1, err1 := time.Parse("2006-01-02 15:04:05", time1)
	if err1 == nil && t1.After(time.Now()) {
		fmt.Println("true")
	} else {
		fmt.Println("false")
		return
	}

	// Create the PDF metadata table.
	if err := metadata.Newdb(); err != nil {
		fmt.Println("init metadata db failed:", err)
		return
	}

	// Default search window: yesterday to today, formatted MM/DD/YYYY.
	const dateLayout = "01/02/2006"
	now := time.Now()
	yestday := now.AddDate(0, 0, -1).Format(dateLayout)
	today := now.Format(dateLayout)

	County := flag.String("county", "", "target County")
	AgencyName := flag.String("AgencyName", "", "target AgencyName")
	Start := flag.String("start", yestday, "start time") // search start date
	End := flag.String("end", today, "end time")         // search end date
	// true: solve captchas through Chaojiying; false: type the captcha by hand.
	Usecjy := flag.Bool("usecjy", true, "uer chao ji ying as picture valid")
	flag.Parse()

	if *County != "" {
		fmt.Println("Country:", *County)
	}
	if *AgencyName != "" {
		fmt.Println("AgencyName:", *AgencyName)
	}
	if *Start != "" {
		fmt.Println("Start:", *Start)
	}
	if *End != "" {
		fmt.Println("End:", *End)
	}
	if *Usecjy {
		fmt.Println("使用超级鹰作为验证码平台")
	}

	var cct country.CCT
	if v, ok := country.QuecyCountry(*County); ok {
		cct.County = *County
		cct.RegionID = v
	}
	if v, ok := country.QueryLawEnforcementAgency(*AgencyName); ok {
		cct.AgencyName = *AgencyName
		cct.Forcement = v
	}
	cct.CrashEndDate = *End
	cct.CrashStartDate = *Start
	fmt.Printf("%+v", cct)

	for {
		// Pick a proxy from the free proxy list; wait and retry when none is found.
		proxyurl, ok := getProxy()
		if !ok {
			fmt.Println("query proxy ip node")
			time.Sleep(time.Second * 200)
			continue
		}
		ying.Proxycfg.IPProxy = proxyurl[0]
		fmt.Println("代理ip池为:", proxyurl)
		fmt.Println("选中代理ip为:", ying.Proxycfg.IPProxy)

		fmt.Println("-------------准备搜索pdf资源--------------------")

		// Query the Chaojiying balance.
		cjy := ying.NewChaojiying()
		d0 := cjy.GetScore(ying.Proxycfg.User, ying.Proxycfg.Passwd)
		fmt.Println("超级鹰余额:", string(d0))

		// Run the crawl; any error triggers another round with a new proxy.
		url := "https://jialulue/huoxinghao/req"
		err := homepage.GetHome(*Usecjy, url, &cct)
		if err == nil {
			break
		}
		fmt.Println(err)
		fmt.Println("invalid http/https proxy, cannot connet to america,  retry now")
	}

	fmt.Println("任务完成")
}

// getProxy downloads the free proxy list at https://ip.ihuan.me/ and returns
// the "http://ip:port" URLs of all rows tagged "美国" (United States).
//
// The boolean is false when the page cannot be fetched or parsed, or when no
// matching row is found; the slice is nil in that case. Failures are logged
// and reported through the boolean so that the caller's retry loop can run.
// The original called log.Fatal, which ended the process, never closed the
// response body, and could panic on a row with fewer than two cells.
func getProxy() ([]string, bool) {
	// Bound the request so a stalled server cannot hang the retry loop.
	client := &http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest("GET", "https://ip.ihuan.me/", nil)
	if err != nil {
		log.Println(err)
		return nil, false
	}
	req.Header.Set("authority", "ip.ihuan.me")
	req.Header.Set("cache-control", "max-age=0")
	req.Header.Set("upgrade-insecure-requests", "1")
	req.Header.Set("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56")
	req.Header.Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
	req.Header.Set("sec-fetch-site", "none")
	req.Header.Set("sec-fetch-mode", "navigate")
	req.Header.Set("sec-fetch-user", "?1")
	req.Header.Set("sec-fetch-dest", "document")
	req.Header.Set("accept-language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")

	resp, err := client.Do(req)
	if err != nil {
		log.Println(err)
		return nil, false
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Println(err)
		return nil, false
	}

	var proxypool []string
	doc.Find(".table-responsive").Find("tbody").Find("tr").Each(func(i int, row *goquery.Selection) {
		// Keep only rows that contain a link whose text is "美国".
		isUS := false
		row.Find("a").Each(func(i int, link *goquery.Selection) {
			if link.Text() == "美国" {
				isUS = true
			}
		})
		if !isUS {
			return
		}

		// The first two words of the row's cell text are the IP and the port.
		var cells strings.Builder
		row.Find("td").Each(func(i int, cell *goquery.Selection) {
			cells.WriteString(cell.Text())
			cells.WriteString(" ")
		})
		ss := strings.Fields(cells.String())
		if len(ss) < 2 {
			return
		}

		proxyurl := "http://" + ss[0] + ":" + ss[1]
		proxypool = append(proxypool, proxyurl)
		fmt.Println("proxy is: ", proxyurl)
	})

	if len(proxypool) == 0 {
		return nil, false
	}
	return proxypool, true
}

执行效果:
mysql存储元数据:

文件系统目录存储文档:
这个就不展示了。

golang 高性能服务编程群:

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/994032.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-05-21
下一篇 2022-05-21

发表评论

登录后才能评论

评论列表(0条)

保存