|
| 1 | +''' |
| 2 | +MIT License |
| 3 | +Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk). |
| 4 | +Project: Harmony (https://harmonydata.ac.uk) |
| 5 | +Maintainer: Thomas Wood (https://fastdatascience.com) |
| 6 | +Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | +of this software and associated documentation files (the "Software"), to deal |
| 8 | +in the Software without restriction, including without limitation the rights |
| 9 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 | +copies of the Software, and to permit persons to whom the Software is |
| 11 | +furnished to do so, subject to the following conditions: |
| 12 | +The above copyright notice and this permission notice shall be included in all |
| 13 | +copies or substantial portions of the Software. |
| 14 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 15 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 17 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 18 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 20 | +SOFTWARE. |
| 21 | +''' |
| 22 | + |
| 23 | +from typing import List |
| 24 | +from harmony.schemas.requests.text import RawFile, Instrument, Question |
| 25 | +from harmony.parsing.util import normalise_text |
| 26 | + |
| 27 | +# Try to import BeautifulSoup, fall back to basic text extraction if not available |
| 28 | +try: |
| 29 | + from bs4 import BeautifulSoup |
| 30 | + BEAUTIFULSOUP_AVAILABLE = True |
| 31 | +except ImportError: |
| 32 | + BEAUTIFULSOUP_AVAILABLE = False |
| 33 | + |
| 34 | +# Try to import lxml for better performance, fall back to html.parser |
| 35 | +try: |
| 36 | + import lxml |
| 37 | + DEFAULT_PARSER = 'lxml' |
| 38 | +except ImportError: |
| 39 | + DEFAULT_PARSER = 'html.parser' |
| 40 | + |
| 41 | + |
def convert_html_to_instruments(file: RawFile) -> List[Instrument]:
    """
    Convert an HTML file into a list of Harmony instruments.

    Parses the HTML content, strips markup to recover plain text, then
    applies heuristics to pick out questionnaire-like sentences. Uses
    BeautifulSoup for extraction when it is installed; otherwise falls
    back to a regex-based extractor.

    Args:
        file (RawFile): The raw HTML file to parse.

    Returns:
        List[Instrument]: A single-element list containing the extracted
        instrument, or an empty list if no content/questions were found.
    """
    if not file.content:
        return []

    # Pick the best available extractor for this environment.
    extract = (
        _extract_text_with_beautifulsoup
        if BEAUTIFULSOUP_AVAILABLE
        else _extract_text_basic
    )
    text_content = extract(file.content)

    if not text_content.strip():
        return []

    # Turn the flat text into candidate questionnaire items.
    questions = _extract_questions_from_text(text_content)
    if not questions:
        return []

    return [
        Instrument(
            file_id=file.file_id,
            instrument_name=file.file_name or "HTML Document",
            questions=questions,
            language="en"  # Default to English, could be enhanced with language detection
        )
    ]
| 85 | + |
| 86 | + |
def _extract_text_with_beautifulsoup(html_content: str) -> str:
    """
    Extract plain text from HTML using BeautifulSoup.

    Strips <script> and <style> elements, pulls the remaining text, and
    collapses it into a single whitespace-normalised string. Any parsing
    failure falls back to the regex-based extractor.

    Args:
        html_content (str): Raw HTML content.

    Returns:
        str: Extracted text content.
    """
    try:
        soup = BeautifulSoup(html_content, DEFAULT_PARSER)

        # Script/style bodies are code, not document text - drop them.
        for tag in soup(["script", "style"]):
            tag.decompose()

        raw_text = soup.get_text()

        # Normalise whitespace: strip each line, split on spaces, and
        # rejoin the non-empty fragments with single spaces.
        fragments = []
        for raw_line in raw_text.splitlines():
            for piece in raw_line.strip().split(" "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        return ' '.join(fragments)

    except Exception:
        # Deliberate best-effort fallback if BeautifulSoup chokes on the input.
        return _extract_text_basic(html_content)
| 122 | + |
| 123 | + |
| 124 | +def _extract_text_basic(html_content: str) -> str: |
| 125 | + """ |
| 126 | + Basic text extraction from HTML without external dependencies. |
| 127 | + |
| 128 | + This is a fallback method that uses simple string operations |
| 129 | + to remove HTML tags when BeautifulSoup is not available. |
| 130 | + |
| 131 | + Args: |
| 132 | + html_content (str): Raw HTML content |
| 133 | + |
| 134 | + Returns: |
| 135 | + str: Extracted text content |
| 136 | + """ |
| 137 | + import re |
| 138 | + |
| 139 | + # Remove HTML tags |
| 140 | + text = re.sub(r'<[^>]+>', ' ', html_content) |
| 141 | + |
| 142 | + # Handle common HTML entities |
| 143 | + html_entities = { |
| 144 | + '&': '&', |
| 145 | + '<': '<', |
| 146 | + '>': '>', |
| 147 | + '"': '"', |
| 148 | + ''': "'", |
| 149 | + ' ': ' ' |
| 150 | + } |
| 151 | + |
| 152 | + for entity, replacement in html_entities.items(): |
| 153 | + text = text.replace(entity, replacement) |
| 154 | + |
| 155 | + # Clean up whitespace |
| 156 | + text = re.sub(r'\s+', ' ', text).strip() |
| 157 | + |
| 158 | + return text |
| 159 | + |
| 160 | + |
def _extract_questions_from_text(text: str) -> List[Question]:
    """
    Extract potential questions from text content.

    This function looks for question-like patterns in the text and
    creates Question objects from them. It uses heuristics to identify
    sentences that might be questionnaire items.

    Args:
        text (str): Extracted text content

    Returns:
        List[Question]: List of identified questions
    """
    questions = []

    # Normalize the text before segmentation.
    normalized_text = normalise_text(text)

    # Split into candidate sentences on sentence-ending punctuation and
    # line breaks.
    import re
    sentences = re.split(r'[.!?\n\r]+', normalized_text)

    for sentence in sentences:
        sentence = sentence.strip()

        # Skip very short or empty fragments - too short to be real items.
        if len(sentence) < 10:
            continue

        # Keep only sentences that look like questionnaire items.
        if _is_likely_question(sentence):
            questions.append(
                Question(
                    # Number kept questions consecutively (1, 2, 3, ...).
                    # Previously the raw sentence index was used, which
                    # left gaps whenever non-question sentences were skipped.
                    question_no=str(len(questions) + 1),
                    question_intro="",
                    question_text=sentence,
                    options=None,
                    source_page=1
                )
            )

    return questions
| 204 | + |
| 205 | + |
| 206 | +def _is_likely_question(text: str) -> bool: |
| 207 | + """ |
| 208 | + Determine if a text segment is likely to be a questionnaire item. |
| 209 | + |
| 210 | + Uses heuristics to identify potential questionnaire items: |
| 211 | + - Contains question words or patterns |
| 212 | + - Has appropriate length |
| 213 | + - Doesn't look like navigation or metadata |
| 214 | + |
| 215 | + Args: |
| 216 | + text (str): Text segment to evaluate |
| 217 | + |
| 218 | + Returns: |
| 219 | + bool: True if the text is likely a question |
| 220 | + """ |
| 221 | + text_lower = text.lower() |
| 222 | + |
| 223 | + # Skip navigation and common non-question patterns |
| 224 | + skip_patterns = [ |
| 225 | + 'click here', 'read more', 'continue', 'next', 'previous', |
| 226 | + 'home', 'about', 'contact', 'privacy', 'terms', |
| 227 | + 'copyright', 'all rights reserved', 'menu', 'navigation' |
| 228 | + ] |
| 229 | + |
| 230 | + for pattern in skip_patterns: |
| 231 | + if pattern in text_lower: |
| 232 | + return False |
| 233 | + |
| 234 | + # Look for question indicators |
| 235 | + question_indicators = [ |
| 236 | + 'how', 'what', 'when', 'where', 'why', 'who', 'which', |
| 237 | + 'do you', 'are you', 'have you', 'would you', 'could you', |
| 238 | + 'please', 'rate', 'scale', 'agree', 'disagree', 'often', |
| 239 | + 'never', 'sometimes', 'always', 'feel', 'think', 'believe' |
| 240 | + ] |
| 241 | + |
| 242 | + # Check for question indicators |
| 243 | + for indicator in question_indicators: |
| 244 | + if indicator in text_lower: |
| 245 | + return True |
| 246 | + |
| 247 | + # Check if it ends with a question mark |
| 248 | + if text.strip().endswith('?'): |
| 249 | + return True |
| 250 | + |
| 251 | + # Check length - typical questionnaire items are of reasonable length |
| 252 | + if 20 <= len(text) <= 200: |
| 253 | + # Additional heuristics for questionnaire-like content |
| 254 | + if any(word in text_lower for word in ['you', 'your', 'i', 'my']): |
| 255 | + return True |
| 256 | + |
| 257 | + return False |
0 commit comments