#!/usr/bin/env python
#-*- coding:UTF-8 -*-
#####################################################
# Author: sunfx xingrhce@163.com
# Last modified: 2014/11/12 - 2014/11/13
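# Filename: re.py  (NOTE: a script saved under this name shadows the standard
#                   "re" module that it imports; rename it before running)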
# Q Q 群: 236147801
#####################################################
import re
# 1. Finding a pattern in text
pattern = 'this'
text = 'Does this text match the pattern?'
match = re.search(pattern, text)
s = match.start()
e = match.end()
print('Found "%s"\nin "%s"\nfrom %d to %d ("%s")'
      % (match.re.pattern, match.string, s, e, text[s:e]))
# match.re.pattern  the pattern that was searched for
# match.string      the text that was searched
# s                 index where the matched text starts
# e                 index just past the end of the matched text
# text[s:e]         the matched text itself
# 2. Compiling expressions
# Turn each pattern string into a compiled regex object so it can be reused.
regexes = [re.compile(p) for p in ['this', 'that']]
print('Text: %r\n' % text)  # show the text being searched
for regex in regexes:
    # regex.pattern is the source string the object was compiled from.
    # The label and the outcome are emitted on one line, separated by a space.
    outcome = 'match!' if regex.search(text) else 'no match'
    print('Seeking "%s"-> %s' % (regex.pattern, outcome))
# 3. Multiple matches
text = 'abbaaabbbbaaaaa'
pattern = 'ab'
# findall() returns the matched substrings directly.
for match in re.findall(pattern, text):
    print('Found: "%s"' % match)
# finditer() yields match objects, which also give the position of each
# match inside the input text.
for match in re.finditer(pattern, text):
    s = match.start()
    e = match.end()
    print('Found "%s" at %d:%d' % (text[s:e], s, e))
# 4. Pattern syntax
def test_patterns(text,patterns=[]):
for pattern,desc in patterns:
print ‘Pattern %r (%s) \n‘ %(pattern,desc)
print ‘ %r‘ % text
for match in re.finditer(pattern,text):
s = match.start()
e = match.end()
substr = text[s:e] #匹配到的字符
n_backslashes = text[:s].count(‘\\‘) #查找文本:s坐标之前的包含多少\ prefix = ‘.‘ * ( s + n_backslashes )
print ‘ %s%r‘ % (prefix,substr)
print
return
# Show where the literal pattern 'ab' occurs in the sample text.
test_patterns('abbaaabbbbaaaaa',
              [('ab', "'a' followed by 'b'")]
              )
# Greedy vs. non-greedy repetition.
# A repetition is greedy by default: it consumes as much of the text as it
# can. Appending '?' makes it non-greedy, so it consumes as little as it can.
#
#   *   zero or more repetitions
#   +   one or more repetitions
#   ?   zero or one repetition
#
#   ab*       a followed by zero or more b
#   ab+       a followed by one or more b
#   ab?       a followed by zero or one b
#   ab{3}     a followed by exactly three b
#   ab{2,3}   a followed by two to three b
#   ab*?      a followed by zero or more b, non-greedy
#   ab+?      a followed by one or more b, non-greedy
#   ab??      a followed by zero or one b, non-greedy
#   ab{3}?    a followed by exactly three b, non-greedy
#   ab{2,3}?  a followed by two to three b, non-greedy
#
# Usage (the sample is not called "str" so the builtin is not shadowed):
sample = 'absdsdsdsdsd'
print(re.findall('ab*', sample))   # ['ab']
print(re.findall('ab*?', sample))  # ['a']
# 5. Character sets
#
#   [ab]      either a or b
#   a[ab]+    a followed by one or more a or b
#   a[ab]+?   a followed by one or more a or b, non-greedy
#   [^...]    any character that is NOT listed in the set
#   [a-z]     any lowercase ASCII letter
#   [A-Z]     any uppercase ASCII letter
#   [a-zA-Z]  any ASCII letter, lowercase or uppercase
#   [A-Za-z]  any ASCII letter, uppercase or lowercase
#
# The samples are not called "str" so the builtin is not shadowed.
ab_sample = 'aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'
print(re.findall('[ab]', ab_sample))
print(re.findall('a[ab]+', ab_sample))
print(re.findall('a[ab]+?', ab_sample))
print(re.findall('[^_]', ab_sample))
case_sample = 'China,lovE'
print(re.findall('[a-z][A-Z]', case_sample))   # ['vE']
print(re.findall('[A-Z][a-z]', case_sample))   # ['Ch']
print(re.findall('[A-Z][a-z]+', case_sample))  # ['China']
print(re.findall('[a-z][A-Z]+', case_sample))  # ['vE']
print(re.findall('[A-Z][a-z]*', case_sample))  # ['China', 'E']
print(re.findall('[a-z][A-Z]*', case_sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']
print(re.findall('[A-Z][a-z]?', case_sample))  # ['Ch', 'E']
print(re.findall('[a-z][A-Z]?', case_sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']
# The metacharacter '.' matches a single character.
#
#   a.     a followed by any one character
#   b.     b followed by any one character
#   a.*b   a, then as many characters as possible, then b (greedy)
#   a.*?b  a, then as few characters as possible, then b (non-greedy)
c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'
print(re.findall('a.', c))     # ['ai', 'aw', 'as', 'aa', 'ab']
print(re.findall('b.', c))     # ['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
# Greedy: a single match running from the first 'a' to the last 'b'.
print(re.findall('a.*b', c))
# The '?' ends the greediness of '*': instead of running on to the last 'b',
# each match stops at the first 'b' it can, giving the shortest matches
# (three of them here, the first being 'aizhongguoawsb', the last 'aaab').
print(re.findall('a.*?b', c))
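# 6. Escape codes
#
#   Code  Meaning
#   \d    a digit
#   \D    a non-digit
#   \s    whitespace (tab, space, newline)
#   \S    non-whitespace (symbols, letters, digits)
#   \w    alphanumeric (letter, digit or underscore)
#   \W    non-alphanumeric (symbols, tab, space, newline)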
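# 7. Anchoring
#
#   Code  Meaning
#   ^     start of the string or of a line
#   $     end of the string or of a line
#   \A    start of the string
#   \Z    end of the string
#   \b    empty string at the beginning or end of a word
#   \B    empty string that is not at the beginning or end of a word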
# 8. Constraining the search: match() vs. search()
text = 'This is some text --with punctuation.'
pattern = 'is'
print('Text : %s' % text)
print('pattern: %s' % pattern)
# match() only matches at the very start of the text; the text does not
# begin with 'is', so the result is None.
m = re.match(pattern, text)
print('Match : %s' % m)
# search() scans the whole text; 'is' occurs in it, so a match is found.
s = re.search(pattern, text)
print('Search : %s' % s)
pattern = re.compile(r'\b\w*is\w*\b')  # whole words that contain 'is'
print('Text: %s' % text)
pos = 0
while True:
    # A compiled pattern's search() accepts a start position, so each
    # search resumes where the previous match ended.
    match = pattern.search(text, pos)
    if not match:
        break
    s = match.start()
    e = match.end()
    # e - 1 is the index of the last matched character.
    print(' %d : %d = "%s"' % (s, e - 1, text[s:e]))
    pos = e
# 9. Dissecting matches with groups (any regex can be made a group and
#    nested inside a larger expression)
regex = re.compile(r'(\bt\w+)\W+(\w+)')
print('Input text : %s' % text)
print('Pattern : %s' % regex.pattern)
match = regex.search(text)
# group(0) is the text matched by the whole expression; the sub-groups are
# numbered from 1 in the order they appear in the pattern.
print('Entire match : %s' % match.group(0))
print('World start with "t": %s' % match.group(1))   # first group
print('World after "t" word : %s' % match.group(2))  # second group
# Python extends the basic group syntax with named groups:
# (?P<name>pattern)
print(text)
print('')
for pattern in [r'^(?P<first_word>\w+)',
                r'(?P<last_word>\w+)\S*$',
                r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
                r'(?P<ends_with_t>\w+t)\b',
                ]:
    regex = re.compile(pattern)
    match = regex.search(text)
    print('Matching "%s"' % pattern)
    # groups() gives the captured values by position,
    # groupdict() gives them keyed by group name.
    print('  %s' % (match.groups(),))
    print('  %s' % (match.groupdict(),))
    print('')
# To be continued ...
# This article comes from the "BrotherXing" blog; please keep this attribution: http://brotherxing.blog.51cto.com/3994225/1576216
# Original address: http://brotherxing.blog.51cto.com/3994225/1576216