Python批量将word转html，并将html内容发布至网站。

发布时间：2020-12-17 17:10:33 所属栏目：Python 来源：网络整理

导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。代码开头如下：`#coding=utf-8`；`__author__ = 'zhm'`；`from win32com import client as wc`；`import os`；`import time`；`import random`；`import MySQLdb`；`import re`；`def wordsToHtml(dir)`……

以下代码由PHP站长网 52php.cn收集自互联网

现在PHP站长网小编把它分享给大家，仅供参考

#coding=utf-8
__author__ = 'zhm'
from win32com import client as wc
import os
import time
import random
import MySQLdb
import re
def wordsToHtml(dir):
    """Convert every Word document found under *dir* to an HTML file.

    The directory tree is walked recursively.  Each ``.doc`` / ``.docx`` file
    (extension compared case-insensitively) is opened through the WPS COM
    automation object and saved next to the original as ``<name>.html``.
    Files with any other extension are skipped without being opened.

    Args:
        dir: Root folder to scan.  The name shadows the builtin but is kept
            because it is part of the existing call signature.

    Raises:
        Whatever the COM layer raises when the application cannot be started
        or a document cannot be opened/saved.  The document and the
        application are still closed in that case.
    """
    def to_text(value):
        # On Python 2, os.walk() over a byte-string root yields byte strings;
        # the original script decoded them as GBK.  Values that are already
        # text (Python 3, or a unicode root on Python 2) pass through as-is.
        return value.decode("gbk") if isinstance(value, bytes) else value

    # Kingsoft WPS automation: preview builds register 'KWPS', release
    # builds register 'WPS'.
    word = wc.Dispatch('KWPS.Application')
    try:
        for path, _subdirs, files in os.walk(dir):
            for wordFile in files:
                name = to_text(wordFile)
                if "." not in name:
                    print('********************ERROR: 未取得后缀名！')
                    continue

                base, suffix = name.rsplit(".", 1)
                if suffix.lower() not in ("doc", "docx"):
                    # Check the extension *before* opening: otherwise every
                    # non-Word file would be opened and never closed.
                    continue

                html_full_name = os.path.join(to_text(path), base + ".html")
                doc = word.Documents.Open(os.path.join(path, wordFile))
                try:
                    # 8 is the numeric save-format code; in the Microsoft Word
                    # object model this value is wdFormatHTML (presumably
                    # mirrored by WPS -- confirm against the WPS docs).
                    doc.SaveAs(html_full_name, 8)
                finally:
                    doc.Close()
                print(u'生成了html文件：' + html_full_name)
    finally:
        # Always shut the automation server down so no orphaned process is
        # left behind when a conversion fails.
        word.Quit()

    print("")
    print("Finished!")

# D:/files is the static directory configured on the web server; published
# files are referenced by their path with the drive letter stripped.
_HTML_TARGET_DIR = 'D:/files/htmls/'
_IMAGE_TARGET_DIR = 'D:/files/images/whzx/'

# Captures the value of the src attribute of every <img> tag.  It is a bytes
# pattern because the HTML is processed as raw bytes, never decoded.
_IMG_SRC_RE = re.compile(br"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)


def _gbk_text(value):
    """Return *value* as text, decoding byte strings as GBK (Python 2 names)."""
    return value.decode("gbk") if isinstance(value, bytes) else value


def _ensure_dir(directory):
    """Create *directory* (and parents) when it does not exist yet."""
    if not os.path.isdir(directory):
        os.makedirs(directory)


def _new_target_path(directory, suffix):
    """Return an unused '<ms timestamp><random number><suffix>' path in *directory*."""
    while True:
        stamp = str(int(time.time() * 1000)) + str(random.randint(100, 10000))
        candidate = os.path.join(directory, stamp + suffix)
        # Retry on the (rare) name collision instead of overwriting a file
        # that was published earlier.
        if not os.path.exists(candidate):
            return candidate


def _web_path(local_path):
    """Strip the drive letter: 'D:/files/a.html' -> '/files/a.html'."""
    return local_path.split(":", 1)[-1]


def _publish_images(html_bytes, source_dir):
    """Copy the images referenced by *html_bytes* and rewrite their src values.

    Returns the HTML bytes with every copied image's src replaced by the web
    path of the copy.  A src that does not resolve to a file below
    *source_dir* is reported and left untouched.
    """
    handled = set()
    for src in _IMG_SRC_RE.findall(html_bytes):
        if src in handled:
            # replace() below already rewrote every occurrence of this src.
            continue
        handled.add(src)

        image_path = os.path.join(source_dir, _gbk_text(src))
        if not os.path.isfile(image_path):
            print("Warning: image not found, src left unchanged: " + repr(image_path))
            continue

        _ensure_dir(_IMAGE_TARGET_DIR)
        target = _new_target_path(_IMAGE_TARGET_DIR, '.png')
        with open(image_path, 'rb') as reader, open(target, 'wb') as writer:
            writer.write(reader.read())
        # The generated path is pure ASCII, so this yields bytes on Python 3
        # and a plain str on Python 2.
        html_bytes = html_bytes.replace(src, _web_path(target).encode("ascii"))
    return html_bytes


def _iframe_html(web_path):
    """Return the snippet that wraps the published page in an auto-sizing iframe."""
    return '''
                    <script type="text/javascript" language="javascript">
                        function iFrameHeight() {
                            var ifm= document.getElementById("iframepage");
                            var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
                            if(ifm != null && subWeb != null) {
                                ifm.height = subWeb.body.scrollHeight;
                            }
                        }
                    </script>
                    <iframe src=''' + web_path + '''
                     marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
                    '''


def html_add_to_db(dir):
    """Publish every converted HTML file under *dir* and register it in MySQL.

    For each ``.htm`` / ``.html`` file found while walking *dir*:

    1. the images referenced by ``<img src=...>`` are copied into the static
       image directory and the src values are rewritten to the copies;
    2. the rewritten HTML is written to the static HTML directory under a
       fresh timestamp-based name;
    3. a row ``(title, iframe snippet)`` is inserted into ``common_article``,
       where the title is the file name without its extension.

    All inserts are committed together at the end.  A failed insert is
    reported and the remaining files are still processed.

    Args:
        dir: Root folder to scan.  The name shadows the builtin but is kept
            because it is part of the existing call signature.

    NOTE(review): the connection settings (including root/root credentials)
    are hard-coded here; consider moving them to configuration.
    """
    conn = MySQLdb.connect(
        host='localhost', port=3306, user='root', passwd='root', db='test', charset='utf8'
    )
    try:
        cur = conn.cursor()
        try:
            # os.walk() yields (dirpath, dirnames, filenames) triples.
            for path, _subdirs, files in os.walk(dir):
                source_dir = _gbk_text(path)
                for htmlFile in files:
                    name = _gbk_text(htmlFile)
                    if "." not in name:
                        print('********************ERROR: 未取得后缀名！')
                        continue

                    title, suffix = name.rsplit(".", 1)
                    if suffix.lower() not in ("htm", "html"):
                        continue

                    # Read the page content, then relocate its images.
                    with open(os.path.join(source_dir, name), 'rb') as source:
                        content = source.read()
                    content = _publish_images(content, source_dir)

                    _ensure_dir(_HTML_TARGET_DIR)
                    target = _new_target_path(_HTML_TARGET_DIR, '.html')
                    with open(target, 'wb') as published:
                        published.write(content)

                    try:
                        # The article body is an iframe pointing at the
                        # published page, not the page markup itself.
                        sql = "insert into common_article(title,content) values(%s,%s)"
                        cur.execute(sql, (title, _iframe_html(_web_path(target))))
                    except (MySQLdb.Error, MySQLdb.Warning) as exc:
                        print("Error: unable to insert data")
                        print(exc)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Close the database connection even when publishing fails part-way.
        conn.close()
if __name__ == '__main__':
    # Script entry point: convert the Word documents first, then publish the
    # HTML generated from them. Both steps operate on the same folder.
    for step in (wordsToHtml, html_add_to_db):
        step('d:/word')

以上内容由PHP站长网【52php.cn】收集整理供大家参考研究

如果以上内容对您有帮助，欢迎收藏、点赞、推荐、分享。

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！