一个检测某网页依赖第三方资源的 Python 脚本
发布时间:2020-12-17 17:25:36 所属栏目:Python 来源:网络整理
导读:今天PHP站长网 52php.cn把收集自互联网的代码分享给大家,仅供参考。 #!/usr/bin/env python # -*- coding: utf8 -*- # 通过输入的网址获取其依赖的站点(html中引用到的) # 依赖文件格式如下: # *.microsoft.com # *.outlook.com ……
#!/usr/bin/env python
# -*- coding: utf8 -*-
# Source note: the following code was collected from the Internet by
# PHP Webmaster Network (52php.cn) and is shared for reference only.
#
# DepSpy: given a URL, list the sites that page depends on, i.e. the hosts
# referenced from <script>/<link> tags in its HTML.
# The dependency file (output) looks like this:
#   *.microsoft.com
#   *.outlook.com
#   *.apple.com
#   *.ibm.com
try:  # Python 2 module names (what the script was written against)
    import urllib2
    import urlparse
except ImportError:  # Python 3: same functions, new locations
    import urllib.request as urllib2
    import urllib.parse as urlparse
import socket
import sys
import re

# Command-line tokens that request the usage text.
_HELP_FLAGS = ('h', '/h', '-h', '?', '/?', '-?', 'help', '-help', '/help')

# Schemes accepted as "this argument is already a full URL".
_URL_PREFIXES = ('http://', 'https://')


def printHelp():
    """Print command-line usage for both invocation styles."""
    print('Approach 1: python DepSpy.py url dstfile')
    print(' * url starts with http:// or https://.')
    print(' * dstfile is the full name of output file,')
    print(' results output to stdout if dstfile is empty.')
    print('\r\nApproach 2: python DepSpy.py urlfile dstfile')
    print(' * urlfile is the full name of file listing input urls'
          '(splitted by \\n).')
    print(' * dstfile is the full name of output file,')
    print(' results output to stdout if dstfile is empty.')


def dispatch(args):
    """Run the feature selected by the command line.

    Args:
        args: argv-style list; args[1] is either a help flag, a URL starting
            with http:// or https://, or the path of a URL-list file.

    Returns:
        Ordered list of unique dependency hosts. Always a list: empty when
        help was printed or when any error occurred (the error is printed).
    """
    try:
        if len(args) < 2:
            printHelp()
            return []
        if len(args) == 2 and args[1] in _HELP_FLAGS:
            printHelp()
            # Return a list here too so the caller can always iterate.
            return []
        if args[1].startswith(_URL_PREFIXES):
            # The argument is a single URL.
            return getDependHost(args[1])

        # Otherwise the argument names a file listing URLs.
        ret = []
        seen = set()
        for u in readURLList(args[1]):
            print('---- Dealing with: ' + u + ' ----')
            for it in getDependHost(u):
                if it not in seen:
                    seen.add(it)
                    ret.append(it)
        return ret
    except Exception as e:
        # Top-level boundary of the tool: report and yield nothing.
        print(e)
        return []


# Absolute http(s) URL in the src/href attribute of a <script> or <link> tag.
# `[^>]*?` keeps the match inside one tag, so several tags on the same line
# are all found and an unrelated later href is never picked up.
# Group 1 is the quote character, group 2 is the URL.
_pattern = re.compile(
    r'''<(?:script|link)\b[^>]*?(?:src|href)\s*=\s*(["'])(https?://.+?)\1''',
    re.I)

# Left-most label of a host name, including its trailing dot.
_pwww = re.compile(r'^[a-z0-9_-]+\.')


def getHost(url):
    """Reduce a URL to the wildcard host pattern used in the output.

    'http://www.microsoft.com/x.js' -> '*.microsoft.com'
    'https://apple.com/'            -> '*.apple.com'

    Hosts containing 'baidu.com' are returned unchanged (the original
    script special-cases them).
    """
    netloc = urlparse.urlparse(url).netloc.lower()
    if 'baidu.com' in netloc:
        return netloc
    if netloc.count('.') < 2:
        # Bare domain: just prefix the wildcard.
        return '*.' + netloc
    # Replace the first label ("www.", "cdn.", ...) with the wildcard.
    return _pwww.sub('*.', netloc, 1)


def extractDependHosts(html, url):
    """Return the unique dependency hosts referenced by `html`.

    Args:
        html: page source as text.
        url: address the page was fetched from; its own host pattern is
            excluded from the result.

    Returns:
        Host patterns in order of first appearance, without duplicates.
    """
    seen = set([getHost(url)])
    ret = []
    for m in _pattern.finditer(html):
        host = getHost(m.group(2))
        if host not in seen:
            seen.add(host)
            ret.append(host)
    return ret


def getDependHost(url):
    """Fetch `url` and return the hosts its HTML depends on.

    A scheme-less `url` gets 'http://' prepended; https:// URLs are used
    as given. The result is also printed. Any failure (network, decoding,
    parsing) is printed and yields an empty list.
    """
    try:
        if not url.startswith(_URL_PREFIXES):
            url = 'http://' + url
        resp = urllib2.urlopen(url)
        try:
            html = resp.read()
        finally:
            resp.close()
        if not isinstance(html, str):
            # Python 3 returns bytes; the regex above works on text.
            html = html.decode('utf-8', 'replace')
        ret = extractDependHosts(html, url)
        print(ret)
        return ret
    except Exception as e:
        print(e)
        return []


def readURLList(path):
    """Read the URL-list file at `path`.

    One entry per line; carriage returns are dropped and '*' is replaced
    by 'www' so that dependency-file lines such as '*.example.com' can be
    fed back in. Blank lines are skipped.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    with open(path, 'r') as fp:
        text = fp.read()
    lines = text.replace('\r', '').replace('*', 'www').split('\n')
    return [ln.strip() for ln in lines if ln.strip()]


# Program entry point.
if __name__ == '__main__':
    socket.setdefaulttimeout(60)  # global timeout for all network reads
    lst = dispatch(sys.argv)
    if len(sys.argv) > 2:
        try:
            with open(sys.argv[2], 'w') as fp:
                for it in lst:
                    # CRLF line endings, as the original script intended.
                    fp.write(it + '\r\n')
        except (IOError, OSError) as e:
            print('Write File Error')
            print(e)
    else:
        for it in lst:
            print(it)

# Site footer: the content above was collected and compiled by PHP Webmaster
# Network (52php.cn) for reference and study. If it helped you, feel free to
# bookmark, like, recommend and share it. (Editor: Li Datong)
# [Disclaimer] All content on the site comes from the Internet; the views
# expressed are those of their authors only and do not represent the site.
# If your rights have been infringed unintentionally, please contact the
# webmaster promptly to have the content removed.