Zanurkuj w Pythonie/Przetwarzanie HTML-a: Różnice pomiędzy wersjami

Usunięta treść Dodana treść
Piotr (dyskusja | edycje)
poprawki
Piotr (dyskusja | edycje)
Nie podano opisu zmian
Linia 92:
|tekst=
<nowiki>
#-*- coding: utf-8 -*-
import re
 
from BaseHTMLProcessor import BaseHTMLProcessor
import re
class Dialectizer(BaseHTMLProcessor):
from BaseHTMLProcessor import BaseHTMLProcessor
 
subs = ()
class Dialectizer(BaseHTMLProcessor):
def reset(self):
subs = ()
# extend (called from __init__ in ancestor)
def reset(self):
# Reset all data attributes
# dodatek (wywoływany przez __init__ klasy bazowej)
self.verbatim = 0
BaseHTMLProcessor.reset(self)
# Resetuje wszystkie atrybuty
self.verbatim = 0
BaseHTMLProcessor.reset(self)
def start_pre(self, attrs):
# called for every <pre> tag in HTML source
def start_pre(self, attrs):
# Increment verbatim mode count, then handle tag like normal
# wywoływane dla każdego znacznika <pre> w źródle HTML
self.verbatim += 1
# Zwiększa licznik trybu dosłowności verbatim, a następnie
self.unknown_starttag("pre", attrs)
# obsługuje ten znacznik normalnie
self.verbatim += 1
def end_pre(self):
# called for every </pre> tag in HTML source
self.unknown_starttag("pre", attrs)
 
# Decrement verbatim mode count
def end_pre(self):
self.unknown_endtag("pre")
# wywoływane dla każdego znacznika </pre>
self.verbatim -= 1
# Zmniejsza licznik trybu dosłowności verbatim
self.unknown_endtag("pre")
def handle_data(self, text):
# override
self.verbatim -= 1
 
# called for every block of text in HTML source
def handle_data(self, text):
# If in verbatim mode, save text unaltered;
# metoda nadpisana
# otherwise process the text with a series of substitutions
# wywoływane dla każdego bloku tekstu w źródle
self.pieces.append(self.verbatim and text or self.process(text))
# Jeśli jest w trybie dosłownym, zapisuje tekst niezmieniony;
# inaczej przetwarza tekst za pomocą szeregu podstawień
def process(self, text):
self.pieces.append(self.verbatim and text or self.process(text))
# called from handle_data
 
# Process text block by performing series of regular expression
def process(self, text):
# substitutions (actual substitions are defined in descendant)
# wywoływane z handle_data
for fromPattern, toPattern in self.subs:
# Przetwarza każdy blok, wykonując serię podstawień
text = re.sub(fromPattern, toPattern, text)
# za pomocą wyrażeń regularnych (podstawienia są definiowane przez klasy pochodne)
return text
for fromPattern, toPattern in self.subs:
text = re.sub(fromPattern, toPattern, text)
class ChefDialectizer(Dialectizer):
"""convert HTML to Swedish Chef-speak
return text
 
class ChefDialectizer(Dialectizer):
based on the classic chef.x, copyright (c) 1992, 1993 John Hagerman
u"""konwertuje HTML na mowę szwedzkiego szefa kuchni
"""
 
subs = ((r'a([nu])', r'u\1'),
oparte na klasycznym chef.x, copyright (c) 1992, 1993 John Hagerman
(r'A([nu])', r'U\1'),
"""
(r'a\B', r'e'),
subs = ((r'A\Ba([nu])', r'Eu\1'),
(r'en\bA([nu])', r'eeU\1'),
(r'a\BewB', r'ooe'),
(r'A\Be\bB', r'e-aE'),
(r'en\beb', r'iee'),
(r'\bEBew', r'Ioo'),
(r'\BfBe\b', r'ffe-a'),
(r'\Birbe', r'uri'),
(r'(\w*?)i(\w*?)$bE', r'\1ee\2I'),
(r'\bowBf', r'ooff'),
(r'\boBir', r'oour'),
(r'(\bOw*?)i(\w*?)$', r'Oo\1ee\2'),
(r'the\bow', r'zeeoo'),
(r'The\bo', r'Zeeoo'),
(r'th\bbO', r'tOo'),
(r'\Btionthe', r'shunzee'),
(r'\BuThe', r'ooZee'),
(r'th\BUb', r'Oot'),
(r'v\Btion', r'fshun'),
(r'V\Bu', r'Foo'),
(r'w\BU', r'wOo'),
(r'Wv', r'Wf'),
(r'([a-z])[.]V', r'\1. Bork Bork Bork!F')),
(r'w', r'w'),
(r'W', r'W'),
class FuddDialectizer(Dialectizer):
(r'([a-z])[.]', r'\1. Bork Bork Bork!'))
"""convert HTML to Elmer Fudd-speak"""
 
subs = ((r'[rl]', r'w'),
class FuddDialectizer(Dialectizer):
(r'qu', r'qw'),
u"""konwertuje HTML na mowę Elmer Fudda"""
(r'th\b', r'f'),
subs = ((r'th[rl]', r'dw'),
(r'n[.]qu', r'n, uh-hah-hah-hah.qw')),
(r'th\b', r'f'),
(r'th', r'd'),
class OldeDialectizer(Dialectizer):
(r'n[.]', r'n, uh-hah-hah-hah.'))
"""convert HTML to mock Middle English"""
 
subs = ((r'i([bcdfghjklmnpqrstvwxyz])e\b', r'y\1'),
class OldeDialectizer(Dialectizer):
(r'i([bcdfghjklmnpqrstvwxyz])e', r'y\1\1e'),
u"""konwertuje HTML na pozorowany język średnioangielski"""
(r'ick\b', r'yk'),
subs = ((r'iai([bcdfghjklmnpqrstvwxyz])e\b', r'ey\1e1'),
(r'e[ea]i([bcdfghjklmnpqrstvwxyz])e', r'ey\1\1e'),
(r'([bcdfghjklmnpqrstvwxyz])yick\b', r'\1eeyk'),
(r'ia([bcdfghjklmnpqrstvwxyz])er', r'e\1re1e'),
(r'e[ea]([aeioubcdfghjklmnpqrstvwxyz])re\b', r'e\1r1e'),
(r'ia([bcdfghjklmnpqrstvwxyz])y', r'i\1e1ee'),
(r'tion\b([bcdfghjklmnpqrstvwxyz])er', r'cioun\1re'),
(r'ion([aeiou])re\b', r'ioun\1r'),
(r'aidia([bcdfghjklmnpqrstvwxyz])', r'aydei\1e'),
(r'aition\b', r'eycioun'),
(r'ayion\b', r'yioun'),
(r'ayaid', r'eyayde'),
(r'antai', r'auntey'),
(r'eaay\b', r'eey'),
(r'oaay', r'ooey'),
(r'ueant', r'eaunt'),
(r'oeea', r'oee'),
(r'ouoa', r'owoo'),
(r'owue', r'oue'),
(r'\bheoe', r'hio'),
(r've\bou', r'vethow'),
(r'se\bow', r'eou'),
(r"'s\b"bhe', r'eshi'),
(r'icve\b', r'ickveth'),
(r'icsse\b', r'icce'),
(r"'icals\b'", r'ickes'),
(r'tleic\b', r'tilick'),
(r'llics\b', r'licc'),
(r'ouldical\b', r'oldeick'),
(r'owntle\b', r'ounetil'),
(r'unll\b', r'onnel'),
(r'rryould\b', r'ryeolde'),
(r'estown\b', r'esteoune'),
(r'ptun\b', r'pteonne'),
(r'thrry\b', r'therye'),
(r'chest\b', r'cheeste'),
(r'sspt\b', r'ssepte'),
(r'([wybdp])th\b', r'\1ethe'),
(r'([rnt])ch\b', r'\1\1eche'),
(r'fromss\b', r'frosse'),
(r'when([wybdp])\b', r'whan\1e')),
(r'([rnt])\b', r'\1\1e'),
(r'from', r'fro'),
def translate(url, dialectName="chef"):
(r'when', r'whan'))
"""fetch URL and translate using dialect
 
def translate(url, dialectName="chef"):
dialect in ("chef", "fudd", "olde")"""
u"""pobiera plik na podstawie URL-a
import urllib
i tłumaczy korzystając z dialektu, gdzie
sock = urllib.urlopen(url)
dialekt in ("chef", "fudd", "olde")"""
htmlSource = sock.read()
import urllib
sock.close()
sock = urllib.urlopen(url)
parserName = "%sDialectizer" % dialectName.capitalize()
htmlSource = sock.read()
parserClass = globals()[parserName]
sock.close()
parser = parserClass()
parserName = "%sDialectizer" % dialectName.capitalize()
parser.feed(htmlSource)
parserClass = globals()[parserName]
parser.close()
return parser.output()
parser = parserClass()
parser.feed(htmlSource)
parser.close()
def test(url):
return parser.output()
"""test all dialects against URL"""
 
for dialect in ("chef", "fudd", "olde"):
def test(url):
outfile = "%s.html" % dialect
u"""testuje wszystkie dialekty na pewnym URL-u"""
fsock = open(outfile, "wb")
for dialect in ("chef", "fudd", "olde"):
fsock.write(translate(url, dialect))
fsock.close()
outfile = "%s.html" % dialect
import webbrowser
fsock = open(outfile, "wb")
webbrowser.open_new(outfile)
fsock.write(translate(url, dialect))
fsock.close()
import webbrowser
if __name__ == "__main__":
webbrowser.open_new(outfile)
test("http://diveintopython.org/odbchelper_list.html")
if __name__ == "__main__":
test("http://diveintopython.org/odbchelper_list.html")
 
</nowiki>
}}