加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

VB分析超过64K的网页内容(基于XMLHTTP和字节数组处理)

发布时间:2020-12-16 23:02:58 所属栏目:大数据 来源:网络整理
导读:Visual Basic Code '**************************************************************************************************** ' 'Name..........WEBPageReadProgram 'File..........WEBRead.frm 'Version.......1.0.0 'Dependencies..XMLHTTPObject 'Descr

Visual Basic Code
'****************************************************************************************************
'
'Name..........WEBPageReadProgram
'File..........WEBRead.frm
'Version.......1.0.0
'Dependencies..XMLHTTPObject
'Description...DynamicreadURLhtmldata
'Author........ZhouWenXing
'Date..........Nov,5nd2010
'CSDNAccounts..SupermanKing
'
'Copyright(c)2008bywww.rljy.com
'LiuZhoucity,China
'
'****************************************************************************************************
'====================================================================================================
'APIfunctiondefining(API函数定义)
'====================================================================================================
Private Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" ( _
Destination As Any,_
Source As Any,_
ByVal Length As Long)

'====================================================================================================
'Formeventdisposeprocess(窗体基本的事件处理过程)
'====================================================================================================
'====================点击按钮1产生的事件====================
Private Sub Command1_Click()
'====================变量定义====================
Dim strTemp As String' 临时字符串变量
Dim strUserList As String' 最终拼合用户列表的变量
Dim strSearch As String' 搜索关键内容的字符串变量
Dim lngSearchSize As Long' 搜索关键内容的字符串大小
Dim lngStart As Long' 搜索用户字符串时存储开始位置的变量
Dim lngEnd As Long' 搜索用户字符串时存储结束位置的变量
Dim ComXMLHTTP As Object' 访问网页的 XMLHTTP 对象
Dim byteHTML() As Byte' 存储网页内容的字节流数组变量

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================初始化变量====================
strUserList = ""
strSearch = "class=""dropmenu"" onmouSEOver=""showMenu(this.id)"">"
lngSearchSize = LenB(StrConv(strSearch,vbFromUnicode))

'====================开始下载指定URL的数据内容====================
Set ComXMLHTTP = CreateObject("Microsoft.XMLHTTP")'初始化 XMLHTTP 对象
If Err.Number <> 0 Then
MsgBox "错误:" & Err.Number & "," & Err.Description
Err.Clear
Exit Sub
End If
ComXMLHTTP.Open "GET","http://bbs.duowan.com/thread-17408898-2-1.html",False'设置访问方式和URL地址
ComXMLHTTP.setRequestHeader "CONTENT-TYPE","application/x-www-form-urlencoded"'向HTTP头加入参数
ComXMLHTTP.Send'提交HTTP请求
If Err.Number <> 0 Then
MsgBox "错误:" & Err.Number & "," & Err.Description
Err.Clear
Exit Sub
End If
'----------判断下载是否成功----------
If ComXMLHTTP.Status <> 200 Then
MsgBox "访问URL失败,请您确定。",64, "提示"
Exit Sub
End If
'====================下载URL的数据完成后将数据读入字节数组中====================
'----------将数据读入byteHTML这个字节数组中----------
'因为该网页原来是UTF-8编码,所以取得的数据也就是UTF-8的编码数据
byteHTML = ComXMLHTTP.ResponseBody
Call SaveTextFile("c:/UTF-8.txt",byteHTML,"UTF-8")' 保存原始数据到磁盘,可以验证数据的完整性

'----------将UTF-8编码的字节数组转换成Unicode编码的字节数组----------
byteHTML = UTF8ToUnicode(byteHTML)
Call SaveTextFile("c:/Unicode.txt","Unicode")' 保存转换 Unicode 后的数据到磁盘,可以验证数据的完整性

'----------将Unicode编码的字节数组转换成GB2312编码的字节数组----------
'其转换目的是方便用GB2312的字符串查找数据,当然直接用Unicode也是可以实现的
byteHTML = UnicodeToGB2312(byteHTML)
Call SaveTextFile("c:/GB2312.txt",byteHTML)' 保存转换 GB2312 后的数据到磁盘,可以验证数据的完整性


'====================得到完整的GB2312编码数组数据后,开始分析网页内容====================
'第一个找到的被忽略,因为这个不是所需的内容
lngStart = InStr_Array(0,strSearch)
'如果一个都找不到,就没必要继续下去了
If lngStart >= 0 Then
lngStart = lngStart + lngSearchSize
'----------开始循环查找所有用户内容----------
Do
'这里开始才是要找的东西位置
lngStart = InStr_Array(lngStart,strSearch)
If lngStart >= 0 Then
lngStart = lngStart + lngSearchSize
lngEnd = InStr_Array(lngStart,"")
strTemp = Mid_Array(byteHTML,lngStart,lngEnd - lngStart)
lngStart = lngEnd
strUserList = strUserList & strTemp & vbCrLf
End If
Loop While lngStart >= 0
End If
'====================完成工作将用户信息合并内容输出到文本框====================
Text1.Text = strUserList
End Sub

'====================================================================================================
'Userintheclasscustom'sfuntiondisposeprocess(自定义函数及处理过程)
'====================================================================================================
'----------------------------------------------------------------------------------------------------
'FunctionName:UTF8ToUnicode
'InputParameter:funUTF8(ByteArray)-TheUTF-8'sbytearray
'ReturnValue:(ByteArray)-ReturnUnicode'sbytearray
'Description:VisualBasiccompile'sconversiontheUTF-8toUnicodedisposeprocess
'Author:SupermanKing
'----------------------------------------------------------------------------------------------------
Function UTF8ToUnicode(ByRef funUTF8() As Byte) As Byte()
'====================变量定义====================
Dim lngLength As Long
Dim lngLengthB As Long
Dim lngUTF8Char As Long
Dim intWChar As Integer
Dim byteTemp As Byte
Dim byteBit As Byte
Dim byteUnicode() As Byte
Dim lngUTF8Count As Long
Dim i As Long
Dim j As Long

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================初始化变量====================
lngLengthB = 0

'====================校验输入参数====================
lngLength = UBound(funUTF8) + 1
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If

'====================开始循环处理编码转换过程====================
For i = 0 To lngLength - 1
'--------------------根据UTF-8编码规则数UTF-8字符的存储个数--------------------
lngUTF8Count = 0
byteTemp = funUTF8(i)
For j = 1 To 7
byteBit = Int(byteTemp / (2 ^ (8 - j)))'二进制位向右偏移 (8 - j) 个二进制位
byteBit = byteBit And 1'取最后一个二进制位值
If byteBit = 1 Then
lngUTF8Count = lngUTF8Count + 1
Else
'碰到0就结束数字符数操作
Exit For
End If
Next j

'--------------------判断编码内存储的内容是否是经过编码的--------------------
If lngUTF8Count < 2 Or lngUTF8Count > 3 Then
'----------没有经过UTF-8格式编码,直接转换成Unicode编码----------
If lngLengthB = 0 Then
lngLengthB = 2
ReDim byteUnicode(lngLengthB - 1)
Else
lngLengthB = lngLengthB + 2
ReDim Preserve byteUnicode(lngLengthB - 1)
End If
byteUnicode(lngLengthB - 2) = byteTemp
Else
'----------经过UTF-8格式编码,先读出内容后再转换成Unicode编码----------
'读出这几个UTF-8字节内容
For j = 0 To lngUTF8Count - 1
byteTemp = funUTF8(i + j)
If j = 0 Then
'第一个UTF-8编码含编码字节信息,所以取存储信息特别点
byteTemp = byteTemp And ((2 ^ (8 - (lngUTF8Count + 1))) - 1)
lngUTF8Char = byteTemp
Else
'后面的只取6个二进制位
byteTemp = byteTemp And &H3F
lngUTF8Char = lngUTF8Char * &H40'向左偏移6位好存储后面的6位数据
lngUTF8Char = lngUTF8Char Or byteTemp'将低6位的数据补充到编码中
End If
Next j
'已经取出Unicode编码到lngUTF8Char里
If lngLengthB = 0 Then
lngLengthB = 2
ReDim byteUnicode(lngLengthB - 1)
Else
lngLengthB = lngLengthB + 2
ReDim Preserve byteUnicode(lngLengthB - 1)
End If
byteUnicode(lngLengthB - 2) = lngUTF8Char And 255
byteUnicode(lngLengthB - 1) = Int(lngUTF8Char / (2 ^ 8)) And 255
i = i + (lngUTF8Count - 1)
End If
If i > (lngLength - 1) Then
Exit For
End If
Next i

'====================完成编码转换过程,返回数据====================
UTF8ToUnicode = byteUnicode
End Function

'----------------------------------------------------------------------------------------------------
'FunctionName:UnicodeToGB2312
'InputParameter:funUnicode(ByteArray)-TheUnicode'sbytearray
'ReturnValue:(ByteArray)-ReturnGB2312'sbytearray
'Description:VisualBasiccompile'sconversiontheUnicodetoGB2312disposeprocess
'Author:SupermanKing
'----------------------------------------------------------------------------------------------------
Function UnicodeToGB2312(ByRef funUnicode() As Byte) As Byte()
'====================变量定义====================
Dim lngLength As Long
Dim lngLengthB As Long
Dim byteGB2312() As Byte
Dim i As Long
Dim intWChar As Integer
Dim intChar As Integer

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================初始化变量====================
lngLengthB = 0

'====================校验输入参数====================
lngLength = UBound(funUnicode) + 1
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If
lngLength = lngLength / 2

'====================开始循环处理编码转换过程====================
For i = 0 To lngLength - 1
CopyMemory intWChar,funUnicode(i * 2),2
intChar = Asc(StrConv(ChrW(intWChar),vbNarrow))
If intChar < 0 Or intChar > 255 Then
If lngLengthB = 0 Then
lngLengthB = 2
ReDim byteGB2312(lngLengthB - 1)
byteGB2312(lngLengthB - 1) = intChar And 255
byteGB2312(lngLengthB - 2) = Int(CLng("&H" & Hex(intChar)) / (2 ^ 8)) And 255
Else
lngLengthB = lngLengthB + 2
ReDim Preserve byteGB2312(lngLengthB - 1)
byteGB2312(lngLengthB - 1) = intChar And 255
byteGB2312(lngLengthB - 2) = Int(CLng("&H" & Hex(intChar)) / (2 ^ 8)) And 255
End If
Else
If lngLengthB = 0 Then
lngLengthB = 1
ReDim byteGB2312(lngLengthB - 1)
byteGB2312(lngLengthB - 1) = CByte(intChar)
Else
lngLengthB = lngLengthB + 1
ReDim Preserve byteGB2312(lngLengthB - 1)
byteGB2312(lngLengthB - 1) = CByte(intChar)
End If
End If
Next i

'====================完成编码转换过程,返回数据====================
UnicodeToGB2312 = byteGB2312
End Function

'----------------------------------------------------------------------------------------------------
'FunctionName:InStr_Array
'InputParameter:funStart(Long)-Searchthebytearraystart'saddress
':funBytes(ByteArray)-Wantsearchdata'sbytearray
':funFind(String)-Search'squalification
'ReturnValue:(Long)-Findqualification'saddress
'Description:ImitateInStrfunction'sdisposeprocess
'Author:SupermanKing
'----------------------------------------------------------------------------------------------------
Function InStr_Array(ByVal funStart As Long,_
ByRef funBytes() As Byte,_
ByVal funFind As String) As Long
'====================变量定义====================
Dim byteFindArray() As Byte
Dim lngBytesCount As Long
Dim lngFindCount As Long
Dim lngIsFind As Long
Dim i As Long
Dim j As Long

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================初始化变量====================
InStr_Array = -1

'====================校验输入参数====================
'----------校验搜索条件参数----------
If Len(funFind) = 0 Then
Exit Function
End If
'----------校验搜索内容参数----------
lngBytesCount = UBound(funBytes)
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If
byteFindArray = StrConv(funFind,vbFromUnicode)
lngFindCount = UBound(byteFindArray)
'----------校验搜索位置参数----------
If funStart + lngFindCount > lngBytesCount Then
Exit Function
End If

'====================开始搜索数据====================
For i = funStart To lngBytesCount
lngIsFind = 1
For j = 0 To lngFindCount
If funBytes(i + j) < &HA0 And byteFindArray(j) < &HA0 Then
If UCase(Chr(funBytes(i + j))) <> UCase(Chr(byteFindArray(j))) Then
lngIsFind = 0
Exit For
End If
Else
If funBytes(i + j) <> byteFindArray(j) Then
lngIsFind = 0
Exit For
End If
End If
Next j
If lngIsFind = 1 Then
InStr_Array = i
Exit For
End If
Next i
End Function

'----------------------------------------------------------------------------------------------------
'FunctionName:Mid_Array
'InputParameter:funBytes(ByteArray)-Wantgetdata'sbytearray
':funStart(Long)-Wantgetdata'sstartaddress
':funCount(Long)-Wantgetdata'ssize
'ReturnValue:(String)-Returnwantgetstring
'Description:ImitateMidfunction'sdisposeprocess
'Author:SupermanKing
'----------------------------------------------------------------------------------------------------
Function Mid_Array(ByRef funBytes() As Byte,_
ByVal funStart As Long,_
ByVal funCount As Long) As String
'====================变量定义====================
Dim byteRead() As Byte
Dim lngBytesCount As Long

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================初始化变量====================
Mid_Array = ""

'====================校验输入参数====================
lngBytesCount = UBound(funBytes)
If Err.Number <> 0 Then
Err.Clear
Exit Function
End If
If funStart + funCount > lngBytesCount Then
Exit Function
End If

'====================开始取指定数据内容====================
ReDim byteRead(funCount - 1)
CopyMemory byteRead(0),funBytes(funStart),funCount
Mid_Array = StrConv(byteRead,vbUnicode)
End Function

'----------------------------------------------------------------------------------------------------
'FunctionName:SaveTextFile
'InputParameter:funFileName(String)-Savefile'spath
':funBytes(ByteArray)-Savefile'sdata
':funMode(String)-Datacodeingmode
'ReturnValue:(void)
'Description:Save.txtfiledisposeprocess
'Author:SupermanKing
'----------------------------------------------------------------------------------------------------
Sub SaveTextFile(ByVal funFileName As String,_
Optional ByVal funMode As String = "GB2312")
'====================变量定义====================
Dim fs As Integer

On Error Resume Next' 开始设置错误陷阱,防止程序发生意外错误而崩溃
'====================校验输入参数====================
'判断给定文件地址是否可读写,同时也可进行文件数据初始化操作
fs = FreeFile
Open funFileName For Output As #fs
If Err.Number <> 0 Then
MsgBox "错误:" & Err.Number & "," & Err.Description,16, "错误"
Err.Clear
Exit Sub
End If
Close #fs

'====================开始写文件数据====================
fs = FreeFile
Open funFileName For Binary As #fs
'根据编码模式来写TXT文件头,这样可让Windows记事本识别该文件的编码方式
Select Case UCase(funMode)
Case "GB2312":'输出 GB2312 编码的文本文件
Put #1,1, funBytes

Case "UNICODE":'输出 Unicode 编码的文本文件
Put #1, CByte(&HFF)
Put #1,2, CByte(&HFE)
Put #1,3, funBytes

Case "UTF-8":'输出 UTF-8 编码的文本文件
Put #1, CByte(&HEF)
Put #1, CByte(&HBB)
Put #1, CByte(&HBF)
Put #1,4, funBytes
End Select
Close #fs
End Sub

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读