python正则表达式01

re.match()

re.match() tries to match the pattern starting at the first character of the string, so it has a limitation: if the string does not begin with a match, it returns None.

import re

# Pattern walk-through:
#   \s    one whitespace character
#   \d    one digit; {n} repeats the preceding token exactly n times
#   \w    one word character (letter, digit or underscore)
content = 'Hello 123 45677 World_This_is a Regex Demo'
pattern = re.compile(r'^Hello\s\d\d\d\s\d{5}\s\w{10}.*')
result = pattern.match(content)

# Show the match object, the matched text, and the (start, end) range.
for item in (result, result.group(), result.span()):
    print(item)
<re.Match object; span=(0, 42), match='Hello 123 45677 World_This_is a Regex Demo'>
Hello 123 45677 World_This_is a Regex Demo
(0, 42)
import re

# A much shorter pattern: '.*' stands in for everything between the
# leading 'Hello' and the trailing 'Demo'.
content = 'Hello 123 45677 World_This_is a Regex Demo'
pattern = r'Hello.*Demo$'
result = re.match(pattern, content)

print(result)
matched_text = result.group()
print(matched_text)
matched_range = result.span()
print(matched_range)
<re.Match object; span=(0, 42), match='Hello 123 45677 World_This_is a Regex Demo'>
Hello 123 45677 World_This_is a Regex Demo
(0, 42)
import re

# Goal: capture the digits '1234567' with a parenthesised group.
content = 'Hello 1234567 World_This_is a Regex Demo'
res = re.match(r'^Hello\s(\d+).*o$', content)

# Group 0 is still the entire match; group 1 is the first (...) group.
whole, digits = res.group(0, 1)
print(whole)
print(digits)
print(res.span())
Hello 1234567 World_This_is a Regex Demo
1234567
(0, 40)
import re

content = 'Hello 1234567 World_This_is a Regex Demo'

# Greedy '.*' consumes as much as it can, leaving only the last digit
# for the (\d+) group.
greedy = re.compile(r'^He.*(\d+).*Demo$')
result = greedy.match(content)
(captured,) = result.groups()
print(captured)
7
import re

content = 'Hello 1234567 World_This_is a Regex Demo'

# Non-greedy '.*?' consumes as little as it can, so the (\d+) group
# receives the whole run of digits.
pattern = r'^He.*?(\d+).*Demo$'
result = re.match(pattern, content)
digits = result.group(1)
print(digits)
1234567
import re

# The subject string spans two lines.
content = (
    'Hello 1234567 World_\n'
    'This_is a Regex Demo'
)

# By default '.' does not match a newline; the DOTALL flag (re.S) makes
# it match any character, so the pattern can cross the line break.
pattern = re.compile(r'^He.*?(\d+).*?Demo$', re.DOTALL)
result = pattern.match(content)
print(result)
print(result.group(1))
<re.Match object; span=(0, 41), match='Hello 1234567 World_\nThis_is a Regex Demo'>
1234567
import re

content = 'price is $5.00'

# '$' is a regex metacharacter (end-of-string anchor). Left unescaped it
# is not treated as a literal dollar sign, so the match fails and None
# is printed.
pattern = 'price is $5.00'
result = re.match(pattern, content)
print(result)
None
import re

content = 'price is $5.00'

# Escape regex metacharacters with a backslash to match them literally:
#   \$  a literal dollar sign (otherwise the end-of-string anchor)
#   \.  a literal dot (otherwise "any character")
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged; a plain '\$' is an invalid string escape and triggers a
# warning on recent Python versions.
pattern = r'price is \$5\.00'
result = re.match(pattern, content)
print(result)
print(result.group())
<re.Match object; span=(0, 14), match='price is $5.00'>
price is $5.00

re.search()

-- re.search() scans the whole string and returns the first match that satisfies the pattern.

import re

content = 'Extra strings Hello 1234567 World_This_is a Regex Demo Extra stings'

# The pattern itself is fine, but re.match() only matches at the start
# of the string. The text begins with 'Extra strings', so there is no
# match and None is printed.
pattern = re.compile(r'Hello.*?(\d+).*?Demo')
result = pattern.match(content)
print(result)
None

Tip: Prefer re.search() over re.match() unless you specifically need the match to start at the beginning of the string.

import re

content = 'Extra strings Hello 1234567 World_This_is a Regex Demo Extra stings'

# re.search() scans the whole string, so the leading 'Extra strings'
# no longer prevents a match.
pattern = r'Hello.*?(\d+).*?Demo'
result = re.search(pattern, content)
print(result)

# Print the full match (group 0) followed by the captured digits (group 1).
for text in result.group(0, 1):
    print(text)
<re.Match object; span=(14, 54), match='Hello 1234567 World_This_is a Regex Demo'>
Hello 1234567 World_This_is a Regex Demo
1234567
import re

# Sample markup: a list of songs, one of which is flagged class="active".
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君"></i>但愿人长久</a>
        </li>
    </ul>
</div>'''

# Extract the singer and song title from the "active" <li>.
# Group 1 is the singer attribute, group 2 the link text. DOTALL lets
# '.' cross the line breaks inside the markup.
active_song = re.compile(r'<li.*?active.*?singer="(.*?)">(.*?)</a>', re.DOTALL)
result = active_song.search(html)
if result is not None:
    singer, song = result.groups()
    print(singer, song)
    
齐秦 往事随风

re.findall()

re.findall() returns every non-overlapping match of the regex in the string.

import re

# Sample markup: a list of songs, most of them wrapped in an <a> link.
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君"></i>但愿人长久</a>
        </li>
    </ul>
</div>'''

# With three capture groups, findall() returns a list of
# (href, singer, title) tuples, one per match.
song_link = re.compile(r'<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', re.DOTALL)
result = song_link.findall(html)
print(result)
for entry in result:
    print(entry)      # the raw tuple
    print(*entry)     # its three fields separated by spaces
[('/2.mp3', '任贤齐', '沧海一声笑'), ('/3.mp3', '齐秦', '往事随风'), ('/4.mp3', 'beyond', '光辉岁月'), ('/5.mp3', '陈慧琳', '记事本'), ('/6.mp3', '邓丽君', '</i>但愿人长久')]
('/2.mp3', '任贤齐', '沧海一声笑')
/2.mp3 任贤齐 沧海一声笑
('/3.mp3', '齐秦', '往事随风')
/3.mp3 齐秦 往事随风
('/4.mp3', 'beyond', '光辉岁月')
/4.mp3 beyond 光辉岁月
('/5.mp3', '陈慧琳', '记事本')
/5.mp3 陈慧琳 记事本
('/6.mp3', '邓丽君', '</i>但愿人长久')
/6.mp3 邓丽君 </i>但愿人长久
import re

# Match every song title in the list.
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君"></i>但愿人长久</a>
        </li>
    </ul>
</div>'''

# Not every <li> wraps its title in an <a> tag, so the opening and
# closing tags are each placed in a group followed by '?', which makes
# the group optional: (<a.*?>)? ... (</a>)?
# Each result tuple is (opening tag or '', title, closing tag or '').
title_pattern = r'<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>'
result = re.findall(title_pattern, html, re.DOTALL)
print(result)
for _, title, _ in result:
    print(title)
[('', '一路上有你', ''), ('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>'), ('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>'), ('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>'), ('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>'), ('<a href="/6.mp3" singer="邓丽君"></i>', '但愿人长久', '</a>')]
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久

re.sub()

re.sub() replaces the substrings that match the regex with a string of your choice.

import re 

content = 'Extra strings Hello 1234567 World_This_is a Regex Demo Extra sstrings'

# Remove every run of digits by substituting the empty string.
# The pattern is a raw string so that \d reaches the regex engine
# unchanged; a plain '\d' is an invalid string escape and triggers a
# warning on recent Python versions.
content = re.sub(r'\d+', '', content)
print(content)
Extra strings Hello  World_This_is a Regex Demo Extra sstrings

1. First parameter: the regex pattern that locates the substrings to be replaced
2. Second parameter: the replacement string
3. Third parameter: the original string

import re 
content = 'Extra strings Hello 1234567 World_This_is a Regex Demo Extra sstrings'

# Replace every run of digits with the word 'Replacement'.
# The pattern is a raw string so that \d reaches the regex engine
# unchanged; a plain '\d' is an invalid string escape and triggers a
# warning on recent Python versions.
content = re.sub(r'\d+', 'Replacement', content)
print(content)
Extra strings Hello Replacement World_This_is a Regex Demo Extra sstrings
Last modification:April 23rd, 2019 at 10:05 pm

Leave a Comment