how CJK is your page?

I made a Python app that scans the content of a web page to see how Japanese it is, based on the characters used. Because it only looks at which characters appear, a high score could mean the page is Japanese, Chinese, etc. Here it is:

# -*- coding: utf-8 -*-

from sys import argv
from urllib2 import build_opener
from HTMLParser import HTMLParser

class jaHTMLParser(HTMLParser):
    """Count CJK versus non-CJK characters in the text of an HTML page.

    Feed the page with ``feed()``; afterwards ``ja`` holds the number of
    characters inside the CJK code point range and ``nonja`` the number of
    all other characters seen in text nodes.  ``encoding`` starts as UTF-8
    and is replaced by the charset declared in a ``<meta>`` tag, if any;
    it is used to decode text nodes that arrive as byte strings.
    """

    ja = nonja = 0
    encoding = "utf-8"

    # Inclusive code point range treated as "CJK".  Very approximate: it
    # spans everything from U+2E80 up to U+2FA1F, not only ideographs.
    CJK_FIRST = 0x2E80
    CJK_LAST = 0x2FA1F

    def handle_starttag(self, tag, attrs):
        """Pick up the page encoding from a ``<meta>`` charset declaration.

        Understands both ``<meta content="text/html; charset=X">`` and
        ``<meta charset="X">``.  Other tags are ignored.
        """
        if tag != "meta":
            return
        marker = "charset="
        for name, value in attrs:
            if not value:
                # Attributes written without a value are reported as None.
                continue
            if name == "charset":
                charset = value
            elif name == "content" and marker in value.lower():
                start = value.lower().index(marker) + len(marker)
                # Drop any further ";"-separated parameters.
                charset = value[start:].split(";")[0]
            else:
                continue
            charset = charset.strip().strip("\"'")
            if charset:
                self.encoding = charset

    def handle_data(self, data):
        """Classify every character of a text node as CJK or non-CJK.

        Whitespace-only nodes and bare ``/*`` / ``*/`` markers are skipped.
        Byte strings are decoded with ``self.encoding`` (falling back to
        UTF-8 when that codec name is unknown); undecodable bytes are
        dropped rather than counted.  Already-decoded text is used as is.
        """
        if data in ("/*", "*/") or data.isspace():
            return
        if isinstance(data, bytes):
            try:
                data = data.decode(self.encoding, "ignore")
            except LookupError:
                # The page declared a charset Python does not know.
                data = data.decode("utf-8", "ignore")
        for char in data:
            if self.CJK_FIRST <= ord(char) <= self.CJK_LAST:
                self.ja += 1
            else:
                self.nonja += 1

    def unknown_decl(self, data):
        """Ignore unknown declarations: CDATA is not an error!"""
        pass

# Usage: pass the URL of the page to analyse as the only argument.
if len(argv) < 2:
    raise SystemExit("usage: %s URL" % argv[0])

opener = build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]  # avoid 403 forbidden

reader = jaHTMLParser()
reader.CDATA_CONTENT_ELEMENTS = []  # don't treat any CDATA as textual content
response = opener.open(argv[1])
try:
    reader.feed(response.read())
finally:
    # Release the connection even if parsing fails.
    response.close()
reader.close()

# A page with no counted characters reports 0% instead of dividing by zero.
total = reader.ja + reader.nonja
percent = float(100 * reader.ja) / float(total) if total else 0.0
print("%d%%" % percent)

🙂

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s