【2】文件读写-2-8-python读写docx

一、常规用法

安装:

pip install python-docx

1.1 读取内容

读取段落

import docx

# Open the document. Only `import docx` is in scope here, so the constructor
# has to be reached through the package name.
# `docName` must be supplied by the caller.
docStr = docx.Document(docName)
for paragraph in docStr.paragraphs:
    # Text content of this paragraph.
    parStr = paragraph.text
    # True when the paragraph uses the first-level heading style.
    is_heading_1 = paragraph.style.name == 'Heading 1'
    # True when the paragraph is centred (the original compares against 1).
    is_centered = paragraph.paragraph_format.alignment == 1
    # True when the style that follows this paragraph's style is centred.
    next_is_centered = (
        paragraph.style.next_paragraph_style.paragraph_format.alignment == 1
    )
    # Colour settings of the font attached to the paragraph's style.
    style_font_color = paragraph.style.font.color

读取表格

# All tables of the document opened above (despite the name, this is the
# collection of tables, not a count).
numTables = docStr.tables

# Variant 1: walk the rows and index into each row's cells.
for table in numTables:
    row_count = len(table.rows)
    col_count = len(table.columns)
    for i in range(row_count):
        row = table.rows[i].cells
        # Content of row i, column j: row[j].text

# Variant 2: address every cell directly by its (row, column) index.
for table in numTables:
    row_count = len(table.rows)
    col_count = len(table.columns)
    for i in range(row_count):
        for j in range(col_count):
            print(table.cell(i, j).text)

读取图片

import xml.etree.ElementTree as ET
from PIL import Image
from docx import Document

def hasImage(par):
    """Collect the relationship ids of the inline images in a paragraph.

    The paragraph's XML (``par._p.xml``) is parsed and every ``a:blip``
    element found below a ``wp:inline`` element contributes the value of its
    ``r:embed`` attribute.

    :param par: a paragraph object from docx
    :return: a list of r:embed ids in document order; empty when the
        paragraph holds no inline image
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    }
    # ElementTree spells a namespaced attribute name as "{uri}local".
    embed_attr = '{{{0}}}embed'.format(namespace['r'])

    root = ET.fromstring(par._p.xml.encode('utf-8'))

    ids = []
    for inline in root.findall('.//wp:inline', namespace):
        for img in inline.findall('.//a:blip', namespace):
            embed_id = img.attrib.get(embed_attr)
            # Append inside the inner loop so every blip is recorded and an
            # inline without any blip adds nothing; blips that carry no
            # r:embed attribute are skipped instead of raising KeyError.
            if embed_id is not None:
                ids.append(embed_id)

    return ids

# Open the Word file; `one_word_fp` must be supplied by the caller.
document = Document(one_word_fp)
for one_p in document.paragraphs:
    # Relationship ids of the images in this paragraph (empty list if none).
    img_ids = hasImage(one_p)
    for one_id in img_ids:
        # Resolve the relationship id to the part that stores the image bytes.
        image_part = document.part.related_parts[one_id]

        # One file per image: a fixed name would make each image overwrite
        # the previous one.
        img_name = 'test_{0}.png'.format(one_id)
        with open(img_name, "wb") as fw:
            fw.write(image_part.blob)

        # The original author converts to RGB because IE only supports
        # RGB-mode images.
        image = Image.open(img_name)
        image = image.convert('RGB')
        image.save(img_name)

1.2 写word

#coding=utf-8

from docx import Document
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
# Create a new, empty document
document = Document()
# Add headings of different levels
document.add_heading(u'MS WORD写入测试', 0)
document.add_heading(u'一级标题', 1)
document.add_heading(u'二级标题', 2)
# Add a paragraph of text
paragraph = document.add_paragraph(u'我们在做文本测试!')
# Set the font size of a run
run = paragraph.add_run(u'设置字号、')
run.font.size = Pt(24)

# Set the font face of a run
run = paragraph.add_run('Set Font,')
run.font.name = 'Consolas'

# Set a Chinese font: besides font.name, the East Asian font attribute of the
# run's rFonts element is set to the same face
run = paragraph.add_run(u'设置中文字体、')
run.font.name = u'宋体'
r = run._element
r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

# Italic
run = paragraph.add_run(u'斜体、')
run.italic = True

# Bold (kept as two statements so `run` stays bound to the run object rather
# than to True, which a chained assignment would do)
run = paragraph.add_run(u'粗体')
run.bold = True

# Add a quotation
document.add_paragraph('Intense quote', style='Intense Quote')

# Add an unordered (bulleted) list
document.add_paragraph(
    u'无序列表元素1', style='List Bullet'
)
document.add_paragraph(
    u'无序列表元素2', style='List Bullet'
)
# Add an ordered (numbered) list
document.add_paragraph(
    u'有序列表元素1', style='List Number'
)
document.add_paragraph(
    u'有序列表元素2', style='List Number'
)
# Add a picture (image.bmp must be placed next to this script beforehand)
document.add_picture('image.bmp', width=Inches(1.25))

# Add a table with a header row
table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Name'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# Append three more rows; `range` works on both Python 2 and Python 3,
# whereas `xrange` exists only on Python 2
for i in range(3):
    row_cells = table.add_row().cells
    row_cells[0].text = 'test' + str(i)
    row_cells[1].text = str(i)
    row_cells[2].text = 'desc' + str(i)

# Add a page break
document.add_page_break()

# Save the file
document.save(u'测试.docx')

三、讨论

3.1 写入颜色:

from docx import Document
from docx.shared import RGBColor
# Start from an empty document.
document = Document()

# Put a run of text into a fresh paragraph and colour it through its font.
colored_paragraph = document.add_paragraph()
run = colored_paragraph.add_run('some text')
font = run.font
font.color.rgb = RGBColor(0x42, 0x24, 0xE9)

# A second paragraph, added without any colour setting.
p = document.add_paragraph('aaa')

# Write the result to disk.
document.save('demo1.docx')

四、我的应用:

4.1 获取段落的颜色

from docx import Document

# Open the Word file; `one_word_fp` must be supplied by the caller.
document = Document(one_word_fp)
for one_para in document.paragraphs:  # visit every paragraph
    # String form of each run's font colour, in run order. The list is
    # rebuilt for every paragraph, so use it inside this loop.
    para_colors = [str(run.font.color.rgb) for run in one_para.runs]

4.2 doc to docx

# List the LibreOffice packages known to yum (the glob replaces the truncated
# name, which matches no package).
yum list 'libreoffice*'
# Install the LibreOffice suite.
yum install libreoffice.x86_64

import subprocess
# Run LibreOffice without a UI to convert `one_file_fp` to docx; the converted
# file is written to the directory given after --outdir.
convert_cmd = [
    "soffice",
    "--headless",
    "--invisible",
    "--convert-to",
    "docx",
    one_file_fp,
    "--outdir",
    "/data/user/sam/project/drug_news/1.analysis",
]
output = subprocess.check_output(convert_cmd)

参考资料:

药企,独角兽,苏州。团队长期招人,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn