基于xpath选择器、PyQuery、正则表达式的格式清理工具详解

1，使用xpath清理不必要的标签元素，以及无内容标签

from lxml import etree

def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements from an HTML string with XPath, then prune
    elements that carry no content.

    :param text: html_content to clean
    :param xpath_dict: extra ``{name: xpath}`` removal targets; may be empty
        or None. The mapping is copied, so the caller's dict is not modified.
    :return: string type html_content (the serialized, cleaned document)
    '''
    def detach(node):
        # lxml's remove() also discards node.tail (the text that follows the
        # element), so hand the tail over to the previous sibling or to the
        # parent before removing the element itself.
        parent = node.getparent()
        if parent is None:
            # The root element has nothing to be removed from.
            return
        if node.tail:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

    # Work on a copy: update() below must not leak the built-in targets
    # into the dict owned by the caller.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
    # Targets that are removed in practically every case.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })
    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)
    # Regular removal pass: drop every element matched by a target xpath.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8',
                                        pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            detach(bad)
    # Elements with these names are never pruned as "empty", and neither is
    # any element that contains one of them.
    skip_tip = ("name()='img' or name()='tr' or "
                "name()='th' or name()='tbody' or "
                "name()='thead' or name()='table'")
    # Prune every remaining element whose text content is blank.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        if p.getparent() is None:
            # Never try to remove the document root, even when it is empty.
            continue
        if p.xpath(f".//*[{skip_tip}]") or \
                re.sub(r'\s', '', p.xpath('string(.)')):
            continue
        bad_p = etree.tostring(p, encoding='utf-8',
                               pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        detach(p)
    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()

2，使用pyquery清理标签属性，并返回处理后源码和纯净文本

#!/usr/bin/env python

# -*-coding:utf-8-*-

from pyquery import PyQuery as pq

def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    Remove selector-matched nodes and normalize tag attributes with PyQuery.

    :param text: HTML markup to process
    :param url: base url passed to ``absolute_url`` when completing image links
    :param pq_dict: ``{name: selector}`` mapping of nodes to remove; may be
        empty or None
    :return: ``(dom.text(), dom.html())`` of the processed document
    '''
    # Selectors of the nodes that have to go.
    selectors = pq_dict if pq_dict else dict()
    # Attributes that are always kept (table cell spans).
    keep_attrs = ['rowspan', 'colspan']
    # Attributes that may carry an image link.
    image_attrs = ['src', 'data-echo', 'data-src', 'data-original']
    dom = pq(text)
    # Log every node about to be dropped, then drop them per selector.
    for selector in selectors.values():
        for node in dom(selector):
            removed_html = pq(node).html()
            logger.debug(f"clean article content : {removed_html}")
        dom.remove(selector)
    # Attribute handling for every remaining tag.
    for tag in dom('*'):
        wrapped = pq(tag)
        for key, value in tag.attrib.items():
            if key in keep_attrs:
                continue
            if key in image_attrs:
                # Move the (completed) link into ``src`` and blank ``alt``.
                full_link = self.absolute_url(url, value)
                wrapped.remove_attr(key)
                wrapped.attr('src', full_link)
                wrapped.attr('alt', '')
            elif key == 'alt':
                # ``alt`` stays on the tag but with an empty value.
                wrapped.attr(key, '')
            else:
                # Every other attribute is removed.
                wrapped.remove_attr(key)
    return dom.text(), dom.html()

3，使用正则表达式清理空格以及换行符内容

#!/usr/bin/env python

# -*-coding:utf-8-*-

import re

def regular_clean(self, str1: str, str2: str):
    '''
    Final regex pass over the plain text and the markup of an article.

    :param str1: content (plain text)
    :param str2: html_content (markup)
    :return: tuple ``(content, html_content)`` after cleaning
    '''
    def clean_markup(html):
        # clean_blank() deletes every space; applied to whole markup that
        # fuses a tag name with its attributes (``<img src=...>`` becomes
        # ``<imgsrc=...>``).  Split on tags and clean only the text between.
        pieces = re.split(r'(<[^<>]+>)', html)
        # With a single capture group, odd indexes hold the tags themselves.
        return ''.join(
            piece if index % 2 else self.clean_blank(piece)
            for index, piece in enumerate(pieces)
        )

    def new_line(text):
        # Line breaks become a plain space.
        text = re.sub(r'<br\s?/?>', ' ', text)
        # Drop attribute-less inline/wrapper tags and bare self-closing tags.
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = text.replace('\n', '')
        text = re.sub(r'</?h[1-6]>', '', text)
        # NOTE(review): the published code read ``text.replace('', '\n')``,
        # which puts a newline between every character; the search literal
        # looks like it was lost when the article was rendered.  Ending each
        # paragraph with a newline is the presumed intent - confirm.
        text = text.replace('</p>', '</p>\n')
        # Non-breaking spaces (entity or character) become ordinary spaces.
        text = text.replace('&nbsp;', ' ').replace('\xa0', ' ')
        return text

    str1, str2 = self.clean_blank(str1), clean_markup(str2)
    str2 = new_line(text=str2)
    return str1, str2

结尾部分：各个方法封装成类之后的完整代码展示

#!/usr/bin/env python

# -*-coding:utf-8-*-

'''

author: szhan

date：2020-08-17

summary: 清理html_content以及获取纯净数据格式

'''

import re

from lxml import etree

from pyquery import PyQuery as pq

from urllib.parse import urlsplit, urljoin

from loguru import logger

class CleanArticle:
    '''
    Clean an article's HTML and extract its plain text.

    ``run()`` chains three passes: ``xpath_clean`` (lxml), ``pyquery_clean``
    (PyQuery) and ``regular_clean`` (regular expressions).
    '''

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        '''
        :param text: html_content to clean
        :param url: page url, used as the base for relative image links
        :param xpath_dict: extra ``{name: xpath}`` removal targets
        :param pq_dict: extra ``{name: selector}`` removal targets
        '''
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a url.

        :param baseurl: url the target is resolved against
        :param url: target url
        :return: ``url`` unchanged when it already has a scheme, otherwise
            ``urljoin(baseurl, url)``
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Remove blank characters and collapse blank lines.

        Spaces, ideographic spaces, tabs and non-breaking spaces are deleted;
        every run of whitespace that contains a newline becomes one newline.

        :param text: text to clean
        :return: the cleaned text, stripped at both ends
        '''
        text = text.replace(' ', '').replace('\u3000', '') \
            .replace('\t', '').replace('\xa0', '')
        # Collapse newline runs FIRST.  Deleting ``\s{2,}`` before this step
        # swallowed blank lines and CRLF pairs entirely, so the newline
        # collapse never fired and paragraphs were glued together.
        text = re.sub(r'\s*\n\s*', '\n', text)
        # Remaining runs of non-newline whitespace are dropped.
        text = re.sub(r'[^\S\n]{2,}', '', text)
        return text.strip()

    def run(self):
        '''
        Run the three cleaning passes on ``self.text``.

        :return: processed ``(content, html_content)``
        :raises ValueError: when ``self.text`` is empty or not a str
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Pass 1: xpath removes blank text, comments and unwanted elements
        # (iframe, button, form, script, style, video, ...).
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Pass 2: pyquery takes care of attribute level details.
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Pass 3: final regex clean-up.
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unwanted elements from an HTML string with XPath, then prune
        elements that carry no content.

        :param text: html_content to clean
        :param xpath_dict: extra ``{name: xpath}`` removal targets; may be
            empty or None. The mapping is copied, so the caller's dict is
            not modified.
        :return: string type html_content (the serialized, cleaned document)
        '''
        def detach(node):
            # lxml's remove() also discards node.tail (the text that follows
            # the element), so hand the tail over to the previous sibling or
            # to the parent before removing the element itself.
            parent = node.getparent()
            if parent is None:
                # The root element has nothing to be removed from.
                return
            if node.tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or '') + node.tail
                else:
                    parent.text = (parent.text or '') + node.tail
            parent.remove(node)

        # Work on a copy: update() below must not leak the built-in targets
        # into the dict owned by the caller.
        remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
        # Targets that are removed in practically every case.
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })
        parser = etree.HTMLParser(remove_blank_text=True,
                                  remove_comments=True)
        selector = etree.HTML(text, parser=parser)
        # Regular removal pass: drop every element matched by a target xpath.
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                detach(bad)
        # Elements with these names are never pruned as "empty", and neither
        # is any element that contains one of them.
        skip_tip = ("name()='img' or name()='tr' or "
                    "name()='th' or name()='tbody' or "
                    "name()='thead' or name()='table'")
        # Prune every remaining element whose text content is blank.
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            if p.getparent() is None:
                # Never try to remove the document root, even when empty.
                continue
            if p.xpath(f".//*[{skip_tip}]") or \
                    re.sub(r'\s', '', p.xpath('string(.)')):
                continue
            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            detach(p)
        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        Remove selector-matched nodes and normalize tag attributes with
        PyQuery.

        :param text: HTML markup to process
        :param url: base url passed to ``absolute_url`` for image links
        :param pq_dict: ``{name: selector}`` mapping of nodes to remove; may
            be empty or None
        :return: ``(dom.text(), dom.html())`` of the processed document
        '''
        # Selectors of the nodes that have to go.
        selectors = pq_dict if pq_dict else dict()
        # Attributes that are always kept (table cell spans).
        keep_attrs = ['rowspan', 'colspan']
        # Attributes that may carry an image link.
        image_attrs = ['src', 'data-echo', 'data-src', 'data-original']
        dom = pq(text)
        # Log every node about to be dropped, then drop them per selector.
        for selector in selectors.values():
            for node in dom(selector):
                removed_html = pq(node).html()
                logger.debug(f"clean article content : {removed_html}")
            dom.remove(selector)
        # Attribute handling for every remaining tag.
        for tag in dom('*'):
            wrapped = pq(tag)
            for key, value in tag.attrib.items():
                if key in keep_attrs:
                    continue
                if key in image_attrs:
                    # Move the (completed) link into ``src``, blank ``alt``.
                    full_link = self.absolute_url(url, value)
                    wrapped.remove_attr(key)
                    wrapped.attr('src', full_link)
                    wrapped.attr('alt', '')
                elif key == 'alt':
                    # ``alt`` stays on the tag but with an empty value.
                    wrapped.attr(key, '')
                else:
                    # Every other attribute is removed.
                    wrapped.remove_attr(key)
        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Final regex pass over the plain text and the markup of an article.

        :param str1: content (plain text)
        :param str2: html_content (markup)
        :return: tuple ``(content, html_content)`` after cleaning
        '''
        def clean_markup(html):
            # clean_blank() deletes every space; applied to whole markup that
            # fuses a tag name with its attributes (``<img src=...>`` becomes
            # ``<imgsrc=...>``).  Split on tags, clean only the text between.
            pieces = re.split(r'(<[^<>]+>)', html)
            # With a single capture group, odd indexes hold the tags.
            return ''.join(
                piece if index % 2 else self.clean_blank(piece)
                for index, piece in enumerate(pieces)
            )

        def new_line(text):
            # Line breaks become a plain space.
            text = re.sub(r'<br\s?/?>', ' ', text)
            # Drop attribute-less inline/wrapper tags and bare self-closing
            # tags.
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = text.replace('\n', '')
            text = re.sub(r'</?h[1-6]>', '', text)
            # NOTE(review): the published code read
            # ``text.replace('', '\n')``, which puts a newline between every
            # character; the search literal looks like it was lost when the
            # article was rendered.  Ending each paragraph with a newline is
            # the presumed intent - confirm.
            text = text.replace('</p>', '</p>\n')
            # Non-breaking spaces (entity or character) become plain spaces.
            text = text.replace('&nbsp;', ' ').replace('\xa0', ' ')
            return text

        str1, str2 = self.clean_blank(str1), clean_markup(str2)
        str2 = new_line(text=str2)
        return str1, str2

if __name__ == '__main__':
    # Demo run: load a saved page, clean it and print the resulting markup.
    with open('html_content.html', 'r', encoding='utf-8') as f:
        html = f.read()
    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)

总结

到此这篇关于基于xpath选择器、PyQuery、正则表达式的格式清理工具详解的文章就介绍到这了，更多相关PyQuery、正则表达式的格式清理工具内容，请搜索快网idc以前的文章或继续浏览下面的相关文章，希望大家以后多多支持快网idc！

相关文章

微信

快网idc优惠网

QQ交流群