加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 百科 > 正文

Ruby多进程并行抓取网页

发布时间:2020-12-17 04:12:10 所属栏目:百科 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/env ruby#encoding: UTF-8# 抓取每一个站点的首页链接数量# require 'rubygems'# 1.8.7require 'ap'# gem install awesome_printrequire '

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家,仅供参考

#!/usr/bin/env ruby
#encoding: UTF-8

# 抓取每一个站点的首页链接数量
# require 'rubygems'			# 1.8.7
require 'ap'				# gem install awesome_print
require 'json'
require 'net/http'
require 'nokogiri'			# gem install nokogiri
require 'forkmanager'		# gem install parallel-forkmanager
require 'beanstalk-client'	# gem install beanstalk-client

class MultipleCrawler

	class Crawler
		def initialize(user_agent,redirect_limit=1)
			@user_agent = user_agent
			@redirect_limit = redirect_limit
			@timeout = 20 
		end
		attr_accessor :user_agent,:redirect_limit,:timeout
		
		def fetch(website)
			print "Pid:#{Process.pid},fetch: #{website}n"
			redirect,url = @redirect_limit,website
			start_time = Time.now
			redirecting = false
			begin
				begin
					uri = URI.parse(url)
					req = Net::HTTP::Get.new(uri.path)
					req.add_field('User-Agent',@user_agent)
					res = Net::HTTP.start(uri.host,uri.port) do |http|
						http.read_timeout = @timeout
						http.request(req)
					end
					if res.header['location'] # 遇到重定向,则url设定为location,再次抓取
						url = res.header['location'] 
						redirecting = true
					end
					redirect -= 1
				end while redirecting and redirect>=0
				opened_time = (Time.now - start_time).round(4) # 统计打开网站耗时
				encoding = res.body.scan(/<meta.+?charset=["'s]*([w-]+)/i)[0]
				encoding = encoding ? encoding[0].upcase : 'GB18030'
				html = 'UTF-8'==encoding ? res.body : res.body.force_encoding('GB2312'==encoding || 'GBK'==encoding ? 'GB18030' : encoding).encode('UTF-8') 
				doc = Nokogiri::HTML(html)
				processed_time = (Time.now - start_time - opened_time).round(4) # 统计分析链接耗时,1.8.7,('%.4f' % float).to_f 替换 round(4)
				[opened_time,processed_time,doc.css('a[@href]').size,res.header['server']]
			rescue =>e
				e.message  
			end
		end
	end
	
	def initialize(websites,beanstalk_jobs,pm_max=1,user_agent='',redirect_limit=1)
		@websites = websites  				# 网址数组 
		@beanstalk_jobs = beanstalk_jobs	# beanstalk服务器地址和管道参数
		@pm_max = pm_max 					# 最大并行运行进程数
		@user_agent = user_agent 			# user_agent 伪装成浏览器访问
		@redirect_limit = redirect_limit  	# 允许最大重定向次数
		
		@ipc_reader,@ipc_writer = IO.pipe # 缓存结果的 ipc 管道
	end
	
	attr_accessor :user_agent,:redirect_limit
	
	def init_beanstalk_jobs # 准备beanstalk任务
		beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
		#清空beanstalk的残留消息队列
		begin
			while job = beanstalk.reserve(0.1) 
				job.delete
			end
		rescue Beanstalk::TimedOut
			print "Beanstalk queues cleared!n"
		end
		@websites.size.times{|i| beanstalk.put(i)} # 将所有的任务压栈
		beanstalk.close
		rescue => e 
			puts e 
			exit
	end
	
	def process_jobs # 处理任务
		start_time = Time.now
		pm = Parallel::ForkManager.new(@pm_max)
		@pm_max.times do |i| 
			pm.start(i) and next # 启动后,立刻 next 不会等待进程执行完,这样才可以并行运算
			beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
			@ipc_reader.close	 # 关闭读取管道,子进程只返回数据
			loop{ 
				begin
					job = beanstalk.reserve(0.1) # 检测超时为0.1秒,因为任务以前提前压栈
					index = job.body
					job.delete
					website = @websites[index.to_i]
					result = Crawler.new(@user_agent).fetch(website)
					@ipc_writer.puts( ({website=>result}).to_json )
				rescue Beanstalk::DeadlineSoonError,Beanstalk::TimedOut,SystemExit,Interrupt
					break
				end
			}
			@ipc_writer.close
			pm.finish(0)	
		end
		@ipc_writer.close
		begin 
			pm.wait_all_children		# 等待所有子进程处理完毕 
		rescue SystemExit,Interrupt	# 遇到中断,打印消息
			print "Interrupt wait all children!n"
		ensure
			results = read_results
			ap results,:indent => -4,:index=>false	# 打印处理结果
			print "Process end,total: #{@websites.size},crawled: #{results.size},time: #{'%.4f' % (Time.now - start_time)}s.n"
		end
	end
	
	def read_results # 通过管道读取子进程抓取返回的数据
		results = {}
		while result = @ipc_reader.gets
			results.merge! JSON.parse(result)
		end
		@ipc_reader.close
		results
	end
	
	def run # 运行入口
		init_beanstalk_jobs
		process_jobs
	end
end

websites = %w(
http://www.51buy.com/ http://www.360buy.com/ http://www.tmall.com/ http://www.taobao.com/
http://china.alibaba.com/ http://www.paipai.com/ http://shop.qq.com/ http://www.lightinthebox.com/
http://www.amazon.cn/ http://www.newegg.com.cn/ http://www.vancl.com/ http://www.yihaodian.com/ 
http://www.dangdang.com/ http://www.m18.com/ http://www.suning.com/ http://www.hstyle.com/
)
beanstalk_jobs = [['localhost:11300'],'crawler-jobs']
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
pm_max = 10

MultipleCrawler.new(websites,pm_max,user_agent).run

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助,欢迎收藏、点赞、推荐、分享。

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读