Skip to main content

🚣 Remover

Introduction

The remover is a module that given an html string it removes all the tags that are not necessary

Implementation

""" 
Module for minimizing the code
"""
from bs4 import BeautifulSoup
from minify_html import minify


def remover(html_content: str) -> str:
"""
This function processes HTML content, removes unnecessary tags
(including style tags), minifies the HTML, and retrieves the
title and body content.

Parameters:
html_content (str): The HTML content to parse

Returns:
str: The parsed title followed by the minified body content
"""

soup = BeautifulSoup(html_content, 'html.parser')

# Title Extraction
title_tag = soup.find('title')
title = title_tag.get_text() if title_tag else ""

# Script and Style Tag Removal
for tag in soup.find_all(['script', 'style']):
tag.extract()

# Body Extraction (if it exists)
body_content = soup.find('body')
if body_content:
# Minify the HTML within the body tag
minimized_body = minify(str(body_content))
return "Title: " + title + ", Body: " + minimized_body

return "Title: " + title + ", Body: No body content found"

Example

""" 
Example of the remover method
"""
from scrapegraphai.utils.remover import remover

HTML_CONTENT = """
<html>
<head>
<title>Test Page</title>
</head>
<body>
<h1>This is a Test</h1>
<p>Hello, World!</p>
<script>alert("This is a script");</script>
</body>
</html>
"""

parsed_content = remover(HTML_CONTENT)

print(parsed_content)