phase0: archive Python implementation under archive/

This commit is contained in:
Sergey Filkin
2026-04-18 11:29:36 +03:00
parent 771169f93f
commit 425eae7170
20 changed files with 558 additions and 0 deletions
+103
View File
@@ -0,0 +1,103 @@
import json
import os
import re
from functools import lru_cache
from html.parser import HTMLParser
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
API_URL = "https://api.github.com/markdown"
API_VERSION = "2022-11-28"
TEMPLATE_PATH = Path(__file__).resolve().parent.parent / "template.html"
class FirstHeadingParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
self._capture = False
self._done = False
self._parts: list[str] = []
def handle_starttag(self, tag: str, attrs) -> None:
if self._done:
return
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
self._capture = True
def handle_endtag(self, tag: str) -> None:
if self._capture and tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
self._capture = False
self._done = True
def handle_data(self, data: str) -> None:
if self._capture and not self._done:
self._parts.append(data)
def title(self) -> str:
return "".join(self._parts).strip()
def render_markdown(markdown_text: str) -> str:
payload = json.dumps({"text": markdown_text}).encode("utf-8")
headers = {
"Accept": "text/html",
"Content-Type": "application/json",
"User-Agent": "md-to-html-service",
"X-GitHub-Api-Version": API_VERSION,
}
github_token = os.getenv("GITHUB_TOKEN")
if github_token:
headers["Authorization"] = f"Bearer {github_token}"
request = Request(API_URL, data=payload, headers=headers, method="POST")
try:
with urlopen(request, timeout=30) as response:
return response.read().decode("utf-8")
except HTTPError as exc:
error_body = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(
f"GitHub API error: {exc.code} {exc.reason}\n{error_body}"
) from exc
except URLError as exc:
raise RuntimeError(f"Failed to reach GitHub API: {exc.reason}") from exc
def extract_title(html_text: str, fallback: str) -> str:
parser = FirstHeadingParser()
parser.feed(html_text)
return parser.title() or fallback
def apply_template(template_text: str, html_text: str, title: str) -> str:
updated = re.sub(
r"<title>.*?</title>",
f"<title>{title}</title>",
template_text,
flags=re.DOTALL,
)
output_lines = []
inserted = False
html_lines = [f" {line}" if line else "" for line in html_text.splitlines()]
for line in updated.splitlines():
if not inserted and "Markdown -->" in line:
output_lines.extend(html_lines)
inserted = True
continue
output_lines.append(line)
if not inserted:
raise RuntimeError("Template placeholder not found.")
return "\n".join(output_lines) + "\n"
@lru_cache(maxsize=1)
def load_template_text() -> str:
return TEMPLATE_PATH.read_text(encoding="utf-8")
def convert(markdown_text: str, fallback_title: str = "Document") -> str:
html_text = render_markdown(markdown_text)
title = extract_title(html_text, fallback_title)
template_text = load_template_text()
return apply_template(template_text, html_text, title)