```python
import re
```
```python
# Sample text used by the regex examples.
# It is a raw string so that the backslash on the metacharacter line is kept
# literally without relying on an invalid escape sequence ("\ "), which
# newer Python versions warn about. The resulting value is unchanged.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''
# Single-line string used by the ^ and $ anchor examples.
sentence = "Start a sentence and then bring it to an end"
```
#### raw string
```python
# normal string: \t is interpreted as a tab character
print("\tTab")
# raw string (r prefix): the backslash is kept literally
print(r"\tTab")
```
Tab
\tTab
#### re.compile() method
- separate a pattern into a variable
```python
# compile the regex once so it can be reused through the `pattern` variable
pattern = re.compile(r"abc")
```
```python
# finditer yields one re.Match object per match found in the text
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(1, 4), match='abc'>
```python
# slicing the text with the reported span (1, 4) recovers the matched text
print(text_to_search[1:4])
```
abc
#### Rules:
```
Match any single character:
. - Any Character Except New Line
\d - Digit (0-9)
\D - Not a Digit (0-9)
\w - Word Character (a-z, A-Z, 0-9, _)
\W - Not a Word Character
\s - Whitespace (space, tab, newline)
\S - Not Whitespace (space, tab, newline)
Anchors:
\b - Word Boundary
\B - Not a Word Boundary
^ - Beginning of a String
$ - End of a String
[] - Matches Characters in brackets
[^ ] - Matches Characters NOT in brackets
| - Either Or
( ) - Group
Quantifiers:
* - 0 or More
+ - 1 or More
? - 0 or One
{3} - Exact Number
{3,4} - Range of Numbers (Minimum, Maximum)
#### Sample Regexs ####
[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
```
#### basic rules
```python
# \s matches any whitespace character (space, tab, newline)
pattern = re.compile(r"\s")
# findall returns the matched strings themselves as a list
matches = pattern.findall(text_to_search)
print(matches)
```
['\n', '\n', '\n', '\n', '\n', ' ', '\n', '\n', ' ', ' ', ' ', ' ', '\n', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', ' ', '\n', ' ', '\n', ' ', '\n', ' ', '\n', ' ', '\n']
#### anchors
```python
# \b requires a word boundary before "Ha", so the second "Ha" in "HaHa"
# (which directly follows a word character) is not matched
pattern = re.compile(r"\bHa")
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(67, 69), match='Ha'>
<re.Match object; span=(70, 72), match='Ha'>
```python
# \B matches only where there is NO word boundary, so just the "Ha" that
# sits inside "HaHa" is found
pattern = re.compile(r"\BHa")
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(72, 74), match='Ha'>
```python
# ^ anchors the pattern to the beginning of the string
pattern = re.compile(r"^S")
matches = pattern.finditer(sentence)
for match in matches:
    print(match)
```
<re.Match object; span=(0, 1), match='S'>
```python
# $ anchors the pattern to the end of the string
pattern = re.compile(r"end$")
matches = pattern.finditer(sentence)
for match in matches:
    print(match)
```
<re.Match object; span=(41, 44), match='end'>
#### case: match the phone number
```python
# three digits, any character, three digits, any character, four digits;
# the unescaped "." accepts any separator (e.g. "-", "." or "*")
pattern = re.compile(r"\d\d\d.\d\d\d.\d\d\d\d")
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(181, 193), match='123*555*1234'>
<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>
#### case: txt file
```python
# read the whole file into `contents`; leaving the with-block closes the file
with open("data.txt", "r", encoding="utf-8") as file:
    contents = file.read()
print(contents)
```
```python
# the character set [-.] allows only a dash or a dot as the separator
pattern = re.compile(r"\d\d\d[-.]\d\d\d[-.]\d\d\d\d")
matches = pattern.finditer(contents)
for match in matches:
    print(match)
```
```python
# match phone numbers that start with 800 or 900
pattern = re.compile(r"[89]00[-.]\d\d\d[-.]\d\d\d\d")
matches = pattern.finditer(contents)
for match in matches:
    print(match)
```
<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1091, 1103), match='900-555-3205'>
<re.Match object; span=(1439, 1451), match='800-555-6089'>
<re.Match object; span=(1790, 1802), match='800-555-7100'>
<re.Match object; span=(2051, 2063), match='900-555-5118'>
<re.Match object; span=(2826, 2838), match='900-555-5428'>
<re.Match object; span=(3284, 3296), match='800-555-8810'>
<re.Match object; span=(3971, 3983), match='900-555-9598'>
<re.Match object; span=(4945, 4957), match='800-555-2420'>
<re.Match object; span=(5566, 5578), match='900-555-3567'>
<re.Match object; span=(6189, 6201), match='800-555-3216'>
<re.Match object; span=(6889, 6901), match='900-555-7755'>
<re.Match object; span=(7864, 7876), match='800-555-1372'>
<re.Match object; span=(8741, 8753), match='900-555-6426'>
```python
# same phone pattern rewritten with {n} quantifiers instead of repeated \d
pattern = re.compile(r"\d{3}[-.]\d{3}[-.]\d{4}")
matches = pattern.finditer(contents)
for match in matches:
    print(match)
```
```python
# match "Mr" with an optional period, one whitespace character, then a
# capitalised name (\w* also allows a single-letter name such as "T")
pattern = re.compile(r"Mr\.?\s[A-Z]\w*")
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(221, 232), match='Mr. Schafer'>
<re.Match object; span=(233, 241), match='Mr Smith'>
<re.Match object; span=(265, 270), match='Mr. T'>
```python
# match the titles Mr, Ms and Mrs (optional period) followed by a name
pattern = re.compile(r"(Mr|Ms|Mrs)\.?\s[A-Z]\w*")
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
```
<re.Match object; span=(221, 232), match='Mr. Schafer'>
<re.Match object; span=(233, 241), match='Mr Smith'>
<re.Match object; span=(242, 250), match='Ms Davis'>
<re.Match object; span=(251, 264), match='Mrs. Robinson'>
<re.Match object; span=(265, 270), match='Mr. T'>
#### Let's match some emails
```python
# sample email addresses used by the email-matching examples
emails = """
CoreyMSchafer@gmail.com
core.schager@university.edu
core-321-schafer@my-work.net
"""
```
```python
# local part: letters, digits, "-" or "."; domain: letters or "-";
# top-level domain limited to com, edu or net
pattern = re.compile(r"[a-zA-Z0-9-.]+@[a-zA-Z-]+\.(com|edu|net)")
matches = pattern.finditer(emails)
for match in matches:
    print(match)
```
<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 52), match='core.schager@university.edu'>
<re.Match object; span=(53, 81), match='core-321-schafer@my-work.net'>
```python
# more permissive email pattern: also allows "_" and "+" in the local part
# and does not restrict the top-level domain to a fixed list
pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
matches = pattern.finditer(emails)
for match in matches:
    print(match)
```
<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 52), match='core.schager@university.edu'>
<re.Match object; span=(53, 81), match='core-321-schafer@my-work.net'>
#### Some urls
```python
# sample URLs used by the URL-matching examples
urls = """
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
"""
```
```python
# the "s" in https and the "www." prefix are optional; then a domain name,
# a literal dot and a top-level domain
pattern = re.compile(r"https?://(www\.)?\w+\.\w+")
matches = pattern.finditer(urls)
for match in matches:
    print(match)
```
<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>
```python
# capture groups: 1 = optional "www.", 2 = domain name, 3 = "." + top-level domain
pattern = re.compile(r"https?://(www\.)?(\w+)(\.\w+)")
matches = pattern.finditer(urls)
for match in matches:
    # group(2) is the domain name without "www." and without the TLD
    print(match.group(2))
```
google
coreyms
youtube
nasa
### Some application - email validation
```python
import re
def is_valid_email(email):
    """Return True if *email* has the simple shape ``local@domain.tld``.

    The local part and the domain may contain word characters, dots and
    dashes; the final label after the last dot must consist of word
    characters only. This is a lightweight shape check, not a full
    RFC-compliant validator.

    Args:
        email: The string to check.

    Returns:
        True when the whole string matches the pattern, otherwise False.
    """
    pattern = r"[\w.-]+@[\w.-]+\.\w+"
    # fullmatch is used instead of search with ^...$ because "$" also matches
    # just before a trailing newline, which would accept "user@example.com\n".
    return re.fullmatch(pattern, email) is not None
```