• 欢迎访问1024小神,一个只会Python的程序猿不是一个好司机
  • 有什么想对我说的可以在留言板里给我留言哦~
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏1024小神吧

go单任务版爬虫(爬取珍爱网)

爬虫实战 Jason 9个月前 (02-27) 541次浏览 10个评论

爬虫总体算法

单任务版爬虫架构

任务

获取并打印所在城市第一页用户的详细信息

代码实现

/crawler/main.go
package main

import (
	"crawler/engine"
	"crawler/zhenai/parser"
)

// main seeds the crawl with the city-list index page; ParseCityList
// discovers every further URL from there.
func main() {
	seed := engine.Request{
		Url:       "http://www.zhenai.com/zhenghun",
		ParseFunc: parser.ParseCityList,
	}
	engine.Run(seed)
}
/crawler/engine/engine.go
package engine

import (
	"crawler/fetcher"
	"log"
)
// Run drains a FIFO work queue seeded with the given requests: it fetches
// each URL, hands the body to that request's parser, enqueues any newly
// discovered requests, and logs every parsed item. It returns when the
// queue is empty.
func Run(seeds ...Request) {
	var requests []Request
	requests = append(requests, seeds...)
	for len(requests) > 0 {
		// Pop from the front so the crawl proceeds breadth-first.
		r := requests[0]
		requests = requests[1:]
		log.Printf("Fetching: %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			// Fixed: the two concatenated literals were missing a space
			// ("errorfetching"). A single bad URL must not stop the crawl.
			log.Printf("Fetcher: error fetching url %s: %v", r.Url, err)
			continue
		}
		parseResult := r.ParseFunc(body)
		requests = append(requests, parseResult.Requests...)
		for _, item := range parseResult.Items {
			log.Printf("Got item %v", item)
		}
	}
}

/crawler/engine/types.go
package engine

// Request pairs a URL to fetch with the parser for its response body.
type Request struct {
	Url       string
	ParseFunc func([]byte) ParseResult
}

// ParseResult carries everything a parser extracted from one page:
// follow-up requests to crawl and items to report.
type ParseResult struct {
	Requests []Request
	Items    []interface{}
}

// NilParser is a ParseFunc for leaf pages: it ignores the body and
// yields an empty result.
func NilParser([]byte) ParseResult {
	return ParseResult{}
}
/crawler/fetcher/fetcher.go
package fetcher

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"time"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// Fetch downloads the page at url and returns its body converted to UTF-8.
// A browser-like User-Agent is sent because the site rejects Go's default
// client UA. Any non-200 response is reported as an error.
func Fetch(url string) ([]byte, error) {
	// A timeout prevents one stuck server from hanging the entire crawl;
	// the original client had none and could block forever.
	client := &http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Error strings are lowercase per Go convention.
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}
	// Sniff the page's charset from its first bytes, then decode to UTF-8
	// so parsers downstream can work on plain UTF-8 text.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
// determineEncoding sniffs the character encoding of the HTML stream by
// peeking at its first bytes without consuming them.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	// Peek returns io.EOF for pages shorter than 1024 bytes while still
	// handing back every byte it could read. The original discarded those
	// valid bytes and blindly assumed UTF-8; use whatever was peeked and
	// only fall back when nothing at all was read.
	bytes, err := r.Peek(1024)
	if err != nil && len(bytes) == 0 {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
/crawler/zhenai/parser/citylist.go
package parser

import (
	"crawler/engine"
	"regexp"
)
// cityListRe matches city links on the city-list page: group 1 is the city
// URL, group 2 the city name. Compiled once at package scope so
// ParseCityList does not recompile the pattern on every call.
var cityListRe = regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)" [^>]*>([^<]+)</a>`)

// cityLimit caps how many cities are enqueued, to keep test runs small.
const cityLimit = 5

// ParseCityList parses the UTF-8 city-list HTML into a "City: <name>" item
// and a follow-up request (parsed by ParseCity) per city, up to cityLimit.
func ParseCityList(contents []byte) engine.ParseResult {
	matches := cityListRe.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for i, m := range matches {
		if i >= cityLimit {
			break
		}
		result.Items = append(result.Items, "City: "+string(m[2]))
		result.Requests = append(result.Requests, engine.Request{
			Url:       string(m[1]),
			ParseFunc: ParseCity,
		})
	}
	return result
}
/crawler/zhenai/parser/city.go
package parser

import (
	"crawler/engine"
	"regexp"
)
// cityRe matches user profile links on a single city page: group 1 is the
// profile URL, group 2 the user's display name. Compiled once at package
// scope so ParseCity does not recompile the pattern on every call.
var cityRe = regexp.MustCompile(`<a href="(http://album.zhenai.com/u/[0-9]+)" [^>]*>([^<]+)</a>`)

// ParseCity parses one city page into a "User: <name>" item and a profile
// request per user; each request's parser tags the profile with the name.
func ParseCity(contents []byte) engine.ParseResult {
	matches := cityRe.FindAllSubmatch(contents, -1)
	result := engine.ParseResult{}
	for _, m := range matches {
		// name is declared per iteration, so the closure below safely
		// captures this user's name rather than a shared loop variable.
		name := string(m[2])
		result.Items = append(result.Items, "User: "+name)
		result.Requests = append(result.Requests, engine.Request{
			Url: string(m[1]),
			ParseFunc: func(c []byte) engine.ParseResult {
				return ParseProfile(c, "name:"+name)
			},
		})
	}
	return result
}
/crawler/zhenai/parser/profile.go
package parser

import (
	"crawler/engine"
	"crawler/model"
	"regexp"
)
// all matches the attribute <div>s on a profile page (group 1 is the text).
// Compiled once at package scope so ParseProfile does not recompile the
// pattern on every call.
var all = regexp.MustCompile(`<div  data-v-8b1eac0c>([^<]+)</div>`)

// ParseProfile parses one user's profile page into a single model.Profile
// item. The caller-supplied name becomes the first User entry, followed by
// every matched attribute text.
func ParseProfile(contents []byte, name string) engine.ParseResult {
	profile := model.Profile{}
	profile.User = append(profile.User, name)
	// FindAllSubmatch returns nil when nothing matches, and ranging over a
	// nil slice is a no-op — the original's explicit nil check was redundant.
	for _, m := range all.FindAllSubmatch(contents, -1) {
		profile.User = append(profile.User, string(m[1]))
	}
	return engine.ParseResult{
		Items: []interface{}{profile},
	}
}
/crawler/model/profile.go
package model
// Profile holds the scraped attributes of one user as raw strings: the
// first entry is the "name:<user>" tag prepended by ParseProfile, the rest
// are the text of matched <div>s from the profile page.
type Profile struct {
	User []string
}
任务结果


如有失效,请留言告知丨转载请注明原文链接:go单任务版爬虫(爬取珍爱网)
点赞 (10)

您必须 登录 才能发表评论!

(10)个小伙伴在吐槽
  1. qingmingmeng
    爬珍爱网,你是为了啥,大声说出来
    2021-11-16 13:45
  2. decim
    1024!签到时间:2021-08-17 16:05:31,每日打卡,生活更精彩哦~
    2021-08-17 16:05
  3. hzj_2018
    1024!签到时间:2021-07-09 17:36:24,每日打卡,生活更精彩哦~
    2021-07-09 17:36
  4. 寻梦
    单身女性都在这了?
    2021-06-02 13:34
  5. qq018
    太深奥了
    2021-04-17 13:17
  6. qq018
    看不懂,太深了
    2021-04-16 16:59
  7. 鑫鑫
    1024!签到时间:2021-04-05 21:47:02,每日打卡,生活更精彩哦~
    2021-04-05 21:47
  8. 来来来
    也在学
    2021-04-04 22:46
  9. fioes
    fioes
    感谢分享!1024!签到时间:2021-03-07 11:57:18,每日打卡,生活更精彩哦~
    2021-03-07 11:58
  10. tompeng
    谢谢分享
    2021-03-05 14:46