python实现word图片文字分离的小例子

发布时间:2019-09-10编辑:脚本学堂
本文分享一例python代码,实现word图片与文字的分离,有需要的朋友参考学习下,希望对大家有所帮助。

本节内容:
python word图片文字分离

运行下面的代码之前,需要先安装 pywin32 包(本文使用的是 pywin32-218.win-amd64-py3.3),并且本机需要装有 Microsoft Word。

例子:
 

代码示例:
#!/usr/bin/python
#
#site: www.jb200.com
#coding:utf-8 
from win32com import client as wc    
import os    
import glob 
# Module-level COM automation object for 'Word.Application', created as soon
# as this module is imported/run. wordsToHtml() uses this shared object to
# open documents and calls Quit() on it when it is done.
word = wc.Dispatch('Word.Application')
   
# WdSaveFormat values passed to Document.SaveAs (see the Word object model
# reference): 10 is wdFormatFilteredHTML, 5 is wdFormatDOSTextLineBreaks.
_WD_FORMAT_FILTERED_HTML = 10
_WD_FORMAT_DOS_TEXT_LINE_BREAKS = 5


def _convert_word_file(word_path):
    """Export one Word document to a .txt file and a temporary .html file.

    The HTML export is what makes Word write the document's pictures out to
    disk next to the HTML file; the .html file itself is deleted again once
    the text export has been written.
    """
    base_name = os.path.splitext(word_path)[0]
    html_path = base_name + '.html'
    txt_path = base_name + '.txt'

    doc = word.Documents.Open(word_path)
    try:
        print('正在处理图片----------'+html_path)
        print('正在处理文字----------'+txt_path)

        doc.SaveAs(html_path, _WD_FORMAT_FILTERED_HTML)
        doc.SaveAs(txt_path, _WD_FORMAT_DOS_TEXT_LINE_BREAKS)

        os.remove(html_path)
        print('正在删除html文件----------'+html_path)
    finally:
        # Always release the document, even when a save or the delete fails.
        doc.Close()


def wordsToHtml(dir):
    """Split every Word document in *dir* into pictures and plain text.

    All ``*.doc`` files are processed first, then all ``*.docx`` files. For
    each one a ``.txt`` file with the same base name is written next to it,
    and the pictures are extracted as a side effect of a temporary HTML
    export.

    Args:
        dir: Directory containing the Word documents. A trailing path
            separator is optional; an empty string means the current
            working directory.

    Note:
        The shared module-level ``word`` application is quit before this
        function returns (also on error), so it cannot be reused afterwards.
    """
    try:
        for pattern in ('*.doc', '*.docx'):
            # os.path.join works with or without a trailing separator on
            # dir, unlike plain string concatenation.
            for found in glob.glob(os.path.join(dir, pattern)):
                # Word creates hidden "~$name" owner files while a document
                # is open; they are not real documents and cannot be opened.
                if os.path.basename(found).startswith('~$'):
                    continue
                # Word resolves relative paths against its own working
                # directory, not Python's, so always hand it absolute paths.
                _convert_word_file(os.path.abspath(found))
    finally:
        # Do not leave a background Word process behind if a file fails.
        word.Quit()
      
 
if __name__ == '__main__':
    # Directory that holds the Word documents to process. It is written as an
    # explicit absolute path with a trailing separator: the previous value
    # 'F:python' contained no separators at all, which Windows treats as a
    # path relative to the current directory of drive F:.
    # NOTE(review): F:\python\ is assumed to be the intended folder - adjust
    # to the real location of the documents.
    ddir = 'F:\\python\\'
    wordsToHtml(ddir)