-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp3.py
More file actions
46 lines (33 loc) · 1.31 KB
/
nlp3.py
File metadata and controls
46 lines (33 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
"""
Tweet and multilingual tokenization exercises with NLTK.

Demonstrates:
  * regex tokenization of hashtags and @-mentions,
  * nltk.tokenize.TweetTokenizer on a list of tweets,
  * word / capitalized-word / emoji tokenization of a German sentence.

Created on Tue Jul 3 17:53:07 2018
@author: ahanmr
"""
# Import the necessary modules.
# FIX: word_tokenize was used below but never imported (NameError).
from nltk.tokenize import regexp_tokenize, word_tokenize
from nltk.tokenize import TweetTokenizer

tweets = ['This is the best #nlp exercise ive found online! #python',
          '#NLP is super fun! <3 #learning',
          'Thanks @datacamp :) #nlp #python']

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list.
# FIX: the result was previously computed and discarded; bind and print it.
hashtags = regexp_tokenize(tweets[0], pattern1)
print(hashtags)

# Write a pattern that matches both mentions (@) and hashtags (#).
pattern2 = r"([#@]\w+)"
# Use the pattern on the last tweet in the tweets list.
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)

# Use the TweetTokenizer to tokenize all tweets into one list.
# FIX: TweetTokenizer() takes configuration keywords (preserve_case, etc.),
# not the text to tokenize — passing `tweets` to the constructor was a bug.
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

german_text = 'Wann gehen wir zum Pizza? 🍕 Und fährst du mit Über? 🚕'

# Tokenize and print all words in german_text.
all_words = word_tokenize(german_text)
print(all_words)

# Tokenize and print only capitalized words (including Ü for German).
capital_words = r"[A-ZÜ]\w+"
print(regexp_tokenize(german_text, capital_words))

# Tokenize and print only emoji.
# FIX: the original class embedded literal quote and pipe characters
# (['...'|'...']), which a regex character class treats as extra matchable
# characters; this is a clean class over the intended Unicode emoji ranges.
emoji = (r"[\U0001F300-\U0001F5FF"   # symbols & pictographs
         r"\U0001F600-\U0001F64F"    # emoticons
         r"\U0001F680-\U0001F6FF"    # transport & map symbols
         r"\u2600-\u26FF\u2700-\u27BF]")  # misc symbols & dingbats
print(regexp_tokenize(german_text, emoji))