-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwordCloud.py
More file actions
98 lines (62 loc) · 2.71 KB
/
wordCloud.py
File metadata and controls
98 lines (62 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import re
import string

import matplotlib.pyplot as plt
from wordcloud import WordCloud
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in
    ``string.punctuation``. Nothing is substituted in their place, so the
    characters on either side of a removed mark become adjacent.
    """
    # Mapping a code point to None tells str.translate to drop it.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletion_table)
# Render a word cloud from the contents of the characteristics JSON file.
#
# The file is decoded explicitly as UTF-8 (the encoding JSON is specified
# in) rather than with the platform default, which differs between systems.
with open('data/test_characteristics_human.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# One "key: value" fragment per top-level entry; both the key and the
# stringified value contribute words to the cloud.
# NOTE(review): .items() assumes the top-level JSON value is an object.
attribute_texts = [f"{key}: {value!s}" for key, value in data.items()]

# Join the fragments, lowercase them, then strip punctuation before handing
# the text to WordCloud.
text_data = remove_punctuation(" ".join(attribute_texts).lower())

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
def scrape_characteristics(geo_id, timeout=30):
    """Fetch the GEO accession page for ``geo_id`` and return its "Characteristics" cell.

    Args:
        geo_id: Accession identifier, sent as the ``acc`` query parameter.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        Always a string:

        * the markup of the ``<td>`` that follows the ``<td>`` whose text is
          "Characteristics", when the page was fetched and both cells exist;
        * ``""`` when the page was fetched but that label (or the cell after
          it) is missing;
        * ``"Failed to retrieve the page. Status code: <code>"`` for any
          response whose status is not 200;
        * ``"Error: <exception>"`` when the request itself raised a
          ``requests`` exception.

    Failures are reported through the return value rather than raised, so a
    caller that parses the result will also parse those messages.
    """
    # NOTE(review): no import of `requests` or `BeautifulSoup` is visible in
    # this file -- confirm both are in scope before this function is called.
    url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
    try:
        # Let requests build the query string so the id is URL-encoded, and
        # bound the wait so a single unresponsive request cannot hang the run.
        response = requests.get(url, params={"acc": geo_id}, timeout=timeout)
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

    if response.status_code != 200:
        return f"Failed to retrieve the page. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the label cell, then take the cell right after it as the content.
    characteristics_label = soup.find('td', string='Characteristics')
    if characteristics_label is None:
        return ""

    characteristics_content = characteristics_label.find_next_sibling('td')
    if characteristics_content is None:
        return ""
    return str(characteristics_content)
def extract_characteristics(input_str):
    """Count how often each attribute name occurs in a characteristics string.

    Opening ``<td ...>`` tags are stripped first, then every ``name: value``
    pair is located, where ``value`` runs up to the next ``<``. Only the
    names are tallied; the values are discarded.

    Args:
        input_str: Markup/text to scan. ``None`` or an empty string is
            treated as "nothing to extract".

    Returns:
        A dict mapping each attribute name to the number of times it appears
        in ``input_str``, in first-seen order. Empty when there is no input
        or no pair is found.
    """
    if not input_str:
        return {}

    # Drop opening <td ...> tags so that "name: value" text inside the tag's
    # own attributes (e.g. an inline style) is not mistaken for a pair.
    without_td = re.sub(r'<td[^>]*>', '', input_str)

    # NOTE(review): `\w+` cannot span spaces, so a multi-word name such as
    # "cell line: HeLa" is recorded under its last word ("line") -- confirm
    # whether that is intended.
    characteristics_dictionary = {}
    for attribute, _value in re.findall(r'(\w+): ([^<]+)', without_td):
        characteristics_dictionary[attribute] = characteristics_dictionary.get(attribute, 0) + 1
    return characteristics_dictionary
# For each attribute name, tally the number of accessions in GSM whose
# characteristics mention it (each accession contributes at most 1 per name).
attribute_counts = {}
for geo_id in GSM:
    per_accession = extract_characteristics(scrape_characteristics(geo_id))
    for attribute in per_accession:
        attribute_counts[attribute] = attribute_counts.get(attribute, 0) + 1

# Report the totals, one "name: count" line per attribute.
print("Attribute Counts:")
for attribute, count in attribute_counts.items():
    print(f"{attribute}: {count}")