Last active
April 25, 2016 01:45
-
-
Save iomz/4048e3011031d1cfed56 to your computer and use it in GitHub Desktop.
Extracts grades from the comments on an SFC-SFS assignment page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from HTMLParser import HTMLParser | |
from re import match, search | |
from sys import exit, stdout | |
import csv | |
class HWParser(HTMLParser):
    """Collect grades and comments from an SFC-SFS assignment page.

    The parser is a flag-based state machine driven by the HTMLParser
    callbacks.  Inside the table carrying ``cellspacing="2"``, a row with
    ``bgcolor="#e0e0e0"`` is a "meta" row (whose submission it is) and any
    other row is an "answer" row (answer plus comment with grading).  Text
    inside ``<li>`` items of a ``<ul>`` is accumulated into the comment, and
    a grade token at the end of the accumulated comment becomes the grade.

    After ``feed()``:
      * ``grades`` maps student id -> ``[grade, comment]``.
      * ``hw_title`` is the first space-separated word of the text found in
        an ``<h4 class="one">`` while a ``<div class="ja">`` is open.

    Comment, grade and title are kept as UTF-8 encoded byte strings.  The
    script targets Python 2 (it imports the ``HTMLParser`` module), where
    the ``b''`` literals below are ordinary ``str`` objects.
    """

    # A grade is one of A-D or L, optionally followed by + or -, at the very
    # end of the accumulated comment.
    _GRADE_AT_END = br"[A-DL][+\-]?$"

    # End tags whose only effect is to clear a single state flag.
    _FLAG_CLEARED_BY_END_TAG = {
        'table': 'in_table',
        'li': 'in_comment_li',
        'div': 'in_div_class_ja',
        'h4': 'in_h4_class_one',
    }

    def __init__(self):
        """Start with every state flag cleared and no results collected."""
        HTMLParser.__init__(self)

        # Position inside the submission table.
        self.in_table = False
        self.in_meta_tr = False    # row that says whose submission it is
        self.in_answer_tr = False  # row with the answer and graded comment
        self.in_meta_td = False
        self.in_answer_td = False
        self.in_comment_ul = False
        self.in_comment_li = False

        # Most recently read values.
        self.last_read_student_id = ''
        self.last_read_cns_id = ''
        self.last_read_grade = b''
        self.comment_str = b''

        # Results: student id -> [grade, comment].
        self.grades = {}

        # Position inside the homework title heading.
        self.in_div_class_ja = False
        self.in_h4_class_one = False
        self.hw_title = b''

    def handle_starttag(self, tag, attrs):
        """Raise the state flag that corresponds to the opened element.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; elements are recognised by exact pair membership.
        """
        # The submission table is the one with cellspacing="2".
        if tag == 'table' and ('cellspacing', '2') in attrs:
            self.in_table = True

        # Rows, cells and comment lists only matter inside that table.
        if self.in_table:
            if tag == 'tr':
                # A row with the grey background is the meta row.
                if ('bgcolor', '#e0e0e0') in attrs:
                    self.in_meta_tr = True
                else:
                    self.in_answer_tr = True
            elif tag == 'td':
                # In a meta row, the attribute-less cell contains the CNS id
                # and the rowspan="1" cell contains the student id.
                if self.in_meta_tr and (not attrs or ('rowspan', '1') in attrs):
                    self.in_meta_td = True
                # An answer row has only one cell.
                if self.in_answer_tr:
                    self.in_answer_td = True
            elif tag == 'ul':
                self.in_comment_ul = True
                # Start a fresh comment and grade for this (possibly
                # multi-line) comment list.
                self.comment_str = b''
                self.last_read_grade = b''
            elif tag == 'li' and self.in_comment_ul:
                self.in_comment_li = True

        # The title heading is tracked regardless of the table state.
        if tag == 'div' and ('class', 'ja') in attrs:
            self.in_div_class_ja = True
        elif tag == 'h4' and ('class', 'one') in attrs:
            self.in_h4_class_one = True

    def handle_data(self, data):
        """Route a text chunk to the id, comment or title being collected."""
        inside_meta_cell = self.in_meta_td

        # CNS id: 's' or 't', five digits, two lowercase letters.
        if inside_meta_cell and match(r"[st]\d{5}[a-z]{2}", data):
            self.last_read_cns_id = data

        # Student id: eight digits.  Comment text is only considered when
        # the chunk was not taken as a student id.
        if inside_meta_cell and match(r"\d{8}", data):
            self.last_read_student_id = data
            print(self.last_read_student_id)  # to check duplicates
        elif self.in_comment_li:
            self._absorb_comment_text(data.encode('utf-8'))

        # Homework title: keep the first space-separated word only.
        if self.in_div_class_ja and self.in_h4_class_one:
            self.hw_title = data.split(u" ")[0].encode('utf-8')

    def _absorb_comment_text(self, text):
        """Merge one encoded comment chunk into ``comment_str``.

        The first chunk becomes the comment.  Once the comment already ends
        in a grade, later chunks are put in front of it, joined by
        `` -> ``; until then chunks are appended.  The grade is re-read from
        the end of the comment after every chunk.
        """
        so_far = self.comment_str
        if not so_far:
            # First line observed.
            self.comment_str = text
        elif search(self._GRADE_AT_END, so_far):
            # Grade already present at the end: keep it there.
            self.comment_str = text + b' -> ' + so_far
        else:
            # Grade characters not yet seen: keep extending the comment.
            self.comment_str = so_far + text

        grade = search(self._GRADE_AT_END, self.comment_str)
        if grade:
            self.last_read_grade = grade.group()

    def handle_endtag(self, tag):
        """Lower the matching state flag; on ``</ul>`` store the result."""
        if tag == 'tr':
            # Close the meta row if one is open, otherwise the answer row.
            row_flag = 'in_meta_tr' if self.in_meta_tr else 'in_answer_tr'
            setattr(self, row_flag, False)
        elif tag == 'td':
            # Same precedence for cells: meta first, answer otherwise.
            cell_flag = 'in_meta_td' if self.in_meta_td else 'in_answer_td'
            setattr(self, cell_flag, False)
        elif tag == 'ul':
            # Record the grade and comment read for the last student id seen.
            self.grades[self.last_read_student_id] = [
                self.last_read_grade,
                self.comment_str,
            ]
            self.in_comment_ul = False
        elif tag in self._FLAG_CLEARED_BY_END_TAG:
            setattr(self, self._FLAG_CLEARED_BY_END_TAG[tag], False)
def main():
    """Convert ``SFC-SFS.html`` into a per-homework grade sheet.

    Reads ``SFC-SFS.html`` (decoded as EUC-JISX0213) from the current
    directory, parses it with ``HWParser``, adds an empty entry for every
    student id listed in ``meibo.txt`` that has no parsed result, and writes
    ``hw<N>.csv`` where ``<N>`` is the first run of digits in the homework
    title.  A student without a grade is written as ``D``.

    Exits with an error message (non-zero status) when the page cannot be
    parsed or when the homework title contains no number.
    """
    parser = HWParser()

    # Read the page and close the file before parsing starts.
    with open('SFC-SFS.html') as html_file:
        raw_page = html_file.read()
    try:
        parser.feed(raw_page.decode('euc-jisx0213'))
    except AssertionError as err:
        # Report the aborted parse instead of exiting silently with success.
        exit('Failed to parse SFC-SFS.html: %s' % err)

    grades = parser.grades
    hw_title = parser.hw_title

    # Get the hw sequence number from title
    number_match = search(r"\d+", hw_title)
    if number_match is None:
        exit('No homework number found in title: %r' % hw_title)
    hw_number = number_match.group()

    # meibo.txt contains a full list of student ids; students missing from
    # the parsed page are interpolated with an empty grade and comment.
    with open('meibo.txt') as roster:
        for line in roster:
            student_id = line.strip()
            if student_id not in grades:
                grades[student_id] = ['', '']

    # Write out the result to csv for a new sheet
    with open('hw' + hw_number + '.csv', 'wb') as f:
        fields = ['学籍番号', hw_title, 'comment']
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for student_id, result in sorted(grades.iteritems()):
            # Skip entries keyed by an empty id (e.g. from a blank line).
            if student_id and result:
                grade = result[0] or 'D'
                writer.writerow({
                    '学籍番号': student_id,
                    hw_title: grade,
                    'comment': result[1],
                })
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment