git.sur5r.net Git - i3/i3.github.io/blob - _controllers/blog/post.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 """
post.py parses post sources from the ./_posts directory.
   6 """
   7
   8 __author__ = "Ryan McGuire (ryan@enigmacurry.com)"
   9 __date__   = "Mon Feb  2 21:21:04 2009"
  10
  11 import os
  12 import sys
  13 import datetime
  14 import re
  15 import operator
  16 import urlparse
  17 import hashlib
  18 import codecs
  19
  20 import pytz
  21 import yaml
  22 import logging
  23 import BeautifulSoup
  24
  25 import blogofile_bf as bf
  26
  27 logger = logging.getLogger("blogofile.post")
  28
  29 config = bf.config.controllers.blog.post
  30 config.mod = sys.modules[globals()["__name__"]]
  31
  32 # These are all the Blogofile reserved field names for posts. It is not
  33 # recommended that users re-use any of these field names for purposes other
  34 # than the one stated.
  35 reserved_field_names = {
  36     "title"      :"A one-line free-form title for the post",
  37     "date"       :"The date that the post was originally created",
  38     "updated"    :"The date that the post was last updated",
  39     "categories" :"A list of categories that the post pertains to, "\
  40         "each seperated by commas",
  41     "tags"       :"A list of tags that the post pertains to, "\
  42         "each seperated by commas",
  43     "permalink"  :"The full permanent URL for this post. "\
  44         "Automatically created if not provided",
  45     "path"       :"The path from the permalink of the post",
  46     "guid"       :"A unique hash for the post, if not provided it "\
  47         "is assumed that the permalink is the guid",
  48     "slug"       :"The title part of the URL for the post, if not "\
  49         "provided it is automatically generated from the title."\
  50         "It is not used if permalink does not contain :title",
  51     "author"     :"The name of the author of the post",
  52     "filters"    :"The filter chain to apply to the entire post. "\
  53         "If not specified, a default chain based on the file extension is "\
  54         "applied. If set to 'None' it disables all filters, even default ones.",
  55     "filter"     :"synonym for filters",
  56     "draft"      :"If 'true' or 'True', the post is considered to be only a "\
  57         "draft and not to be published.",
  58     "source"     :"Reserved internally",
  59     "yaml"       :"Reserved internally",
  60     "content"    :"Reserved internally",
  61     "filename"   :"Reserved internally"
  62     }
  63
  64
  65 class PostParseException(Exception):
  66
  67     def __init__(self, value):
  68         self.value = value
  69
  70     def __str__(self):
  71         return repr(self.value)
  72
  73
  74 class Post(object):
  75     """
  76     Class to describe a blog post and associated metadata
  77     """
  78     def __init__(self, source, filename="Untitled"):
  79         self.source = source
  80         self.yaml = None
  81         self.title = None
  82         self.__timezone = bf.config.controllers.blog.timezone
  83         self.date = None
  84         self.updated = None
  85         self.categories = set()
  86         self.tags = set()
  87         self.permalink = None
  88         self.content = u""
  89         self.excerpt = u""
  90         self.filename = filename
  91         self.author = ""
  92         self.guid = None
  93         self.slug = None
  94         self.draft = False
  95         self.filters = None
  96         self.__parse()
  97         self.__post_process()
  98
  99     def __repr__(self): #pragma: no cover
 100         return u"<Post title='{0}' date='{1}'>".format(
 101             self.title, self.date.strftime("%Y/%m/%d %H:%M:%S"))
 102
 103     def __parse(self):
 104         """Parse the yaml and fill fields"""
 105         yaml_sep = re.compile("^---$", re.MULTILINE)
 106         content_parts = yaml_sep.split(self.source, maxsplit=2)
 107         if len(content_parts) < 2:
 108             raise PostParseException(u"{0}: Post has no YAML section".format(
 109                     self.filename))
 110         else:
 111             #Extract the yaml at the top
 112             self.__parse_yaml(content_parts[1])
 113             post_src = content_parts[2]
 114         self.__apply_filters(post_src)
 115         #Do post excerpting
 116         self.__parse_post_excerpting()
 117
 118     def __apply_filters(self, post_src):
 119         """Apply filters to the post"""
 120         #Apply block level filters (filters on only part of the post)
 121         # TODO: block level filters on posts
 122         #Apply post level filters (filters on the entire post)
 123         #If filter is unspecified, use the default filter based on
 124         #the file extension:
 125         if self.filters is None:
 126             try:
 127                 file_extension = os.path.splitext(self.filename)[-1][1:]
 128                 self.filters = bf.config.controllers.blog.post_default_filters[
 129                     file_extension]
 130             except KeyError:
 131                 self.filters = []
 132         self.content = bf.filter.run_chain(self.filters, post_src)
 133
 134     def __parse_post_excerpting(self):
 135         if bf.config.controllers.blog.post_excerpts.enabled:
 136             length = bf.config.controllers.blog.post_excerpts.word_length
 137             try:
 138                 self.excerpt = bf.config.post_excerpt(self.content, length)
 139             except AttributeError:
 140                 self.excerpt = self.__excerpt(length)
 141
 142     def __excerpt(self, num_words=50):
 143         #Default post excerpting function
 144         #Can be overridden in _config.py by
 145         #defining post_excerpt(content,num_words)
 146         if len(self.excerpt) == 0:
 147              """Retrieve excerpt from article"""
 148              s = BeautifulSoup.BeautifulSoup(self.content)
 149              # get rid of javascript, noscript and css
 150              [[tree.extract() for tree in s(elem)] for elem in (
 151                      'script', 'noscript', 'style')]
 152              # get rid of doctype
 153              subtree = s.findAll(text=re.compile("DOCTYPE|xml"))
 154              [tree.extract() for tree in subtree]
 155              # remove headers
 156              [[tree.extract() for tree in s(elem)] for elem in (
 157                      'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
 158              text = ''.join(s.findAll(text=True))\
 159                                  .replace("\n", "").split(" ")
 160              return " ".join(text[:num_words]) + '...'
 161
 162     def __post_process(self):
 163         # fill in empty default value
 164         if not self.title:
 165             self.title = u"Untitled - {0}".format(
 166                     datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
 167
 168         if not self.slug:
 169             self.slug = re.sub("[ ?]", "-", self.title).lower()
 170
 171         if not self.date:
 172             self.date = datetime.datetime.now(pytz.timezone(self.__timezone))
 173         if not self.updated:
 174             self.updated = self.date
 175
 176         if not self.categories or len(self.categories) == 0:
 177             self.categories = set([Category('Uncategorized')])
 178         if not self.permalink and \
 179                 bf.config.controllers.blog.auto_permalink.enabled:
 180             self.permalink = bf.config.site.url.rstrip("/") + \
 181                 bf.config.controllers.blog.auto_permalink.path
 182             self.permalink = \
 183                     re.sub(":blog_path", bf.config.blog.path, self.permalink)
 184             self.permalink = \
 185                     re.sub(":year", self.date.strftime("%Y"), self.permalink)
 186             self.permalink = \
 187                     re.sub(":month", self.date.strftime("%m"), self.permalink)
 188             self.permalink = \
 189                     re.sub(":day", self.date.strftime("%d"), self.permalink)
 190             self.permalink = \
 191                     re.sub(":title", self.slug, self.permalink)
 192
 193             # TODO: slugification should be abstracted out somewhere reusable
 194             self.permalink = re.sub(
 195                     ":filename", re.sub(
 196                             "[ ?]", "-", self.filename).lower(), self.permalink)
 197
 198             # Generate sha hash based on title
 199             self.permalink = re.sub(":uuid", hashlib.sha1(
 200                     self.title.encode('utf-8')).hexdigest(), self.permalink)
 201
 202         logger.debug(u"Permalink: {0}".format(self.permalink))
 203
 204     def __parse_yaml(self, yaml_src):
 205         y = yaml.load(yaml_src)
 206         # Load all the fields that require special processing first:
 207         fields_need_processing = ('permalink', 'guid', 'date', 'updated',
 208                                   'categories', 'tags', 'draft')
 209         try:
 210             self.permalink = y['permalink']
 211             if self.permalink.startswith("/"):
 212                 self.permalink = urlparse.urljoin(bf.config.site.url,
 213                         self.permalink)
 214             #Ensure that the permalink is for the same site as bf.config.site.url
 215             if not self.permalink.startswith(bf.config.site.url):
 216                 raise PostParseException(u"{0}: permalink for a different site"
 217                         " than configured".format(self.filename))
 218             logger.debug(u"path from permalink: {0}".format(self.path))
 219         except KeyError:
 220             pass
 221         try:
 222             self.guid = y['guid']
 223         except KeyError:
 224             self.guid = self.permalink
 225         try:
 226             self.date = pytz.timezone(self.__timezone).localize(
 227                 datetime.datetime.strptime(y['date'], config.date_format))
 228         except KeyError:
 229             pass
 230         try:
 231             self.updated = pytz.timezone(self.__timezone).localize(
 232                 datetime.datetime.strptime(y['updated'], config.date_format))
 233         except KeyError:
 234             pass
 235         try:
 236             self.categories = set([Category(x.strip()) for x in \
 237                                        y['categories'].split(",")])
 238         except:
 239             pass
 240         try:
 241             self.tags = set([x.strip() for x in y['tags'].split(",")])
 242         except:
 243             pass
 244         try:
 245             self.filters = y['filter'] #filter is a synonym for filters
 246         except KeyError:
 247             pass
 248         try:
 249             if y['draft']:
 250                 self.draft = True
 251                 logger.info(u"Post {0} is set to draft, "
 252                         "ignoring this post".format(self.filename))
 253             else:
 254                 self.draft = False
 255         except KeyError:
 256             self.draft = False
 257         # Load the rest of the fields that don't need processing:
 258         for field, value in y.items():
 259             if field not in fields_need_processing:
 260                 setattr(self,field,value)
 261
 262     def permapath(self):
 263         """Get just the path portion of a permalink"""
 264         return urlparse.urlparse(self.permalink)[2]
 265
 266     def __cmp__(self, other_post):
 267         "Posts should be comparable by date"
 268         return cmp(self.date, other_post.date)
 269
 270     def __eq__(self, other_post):
 271         return self is other_post
 272
 273     def __getattr__(self, name):
 274         if name == "path":
 275             #Always generate the path from the permalink
 276             return self.permapath()
 277         else:
 278             raise AttributeError, name
 279
 280
 281 class Category(object):
 282
 283     def __init__(self, name):
 284         self.name = unicode(name)
 285         # TODO: slugification should be abstracted out somewhere reusable
 286         # TODO: consider making url_name and path read-only properties?
 287         self.url_name = self.name.lower().replace(" ", "-")
 288         self.path = bf.util.site_path_helper(
 289                 bf.config.controllers.blog.path,
 290                 bf.config.controllers.blog.category_dir,
 291                 self.url_name)
 292
 293     def __eq__(self, other):
 294         if self.name == other.name:
 295             return True
 296         return False
 297
 298     def __hash__(self):
 299         return hash(self.name)
 300
 301     def __repr__(self):
 302         return self.name
 303
 304     def __cmp__(self, other):
 305         return cmp(self.name, other.name)
 306
 307
 308 def parse_posts(directory):
 309     """Retrieve all the posts from the directory specified.
 310
 311     Returns a list of the posts sorted in reverse by date."""
 312     posts = []
 313     post_filename_re = re.compile(
 314         ".*((\.textile$)|(\.markdown$)|(\.org$)|(\.html$)|(\.txt$)|(\.rst$))")
 315     if not os.path.isdir("_posts"):
 316         logger.warn("This site has no _posts directory.")
 317         return []
 318     post_paths = [f.decode("utf-8") for f in bf.util.recursive_file_list(
 319             directory, post_filename_re) if post_filename_re.match(f)]
 320
 321     for post_path in post_paths:
 322         post_fn = os.path.split(post_path)[1]
 323         logger.debug(u"Parsing post: {0}".format(post_path))
 324         #IMO codecs.open is broken on Win32.
 325         #It refuses to open files without replacing newlines with CR+LF
 326         #reverting to regular open and decode:
 327         try:
 328             src = open(post_path, "r").read().decode(
 329                     bf.config.controllers.blog.post_encoding)
 330         except:
 331             logger.exception(u"Error reading post: {0}".format(post_path))
 332             raise
 333         try:
 334             p = Post(src, filename=post_fn)
 335         except PostParseException as e:
 336             logger.warning(u"{0} : Skipping this post.".format(e.value))
 337             continue
 338         #Exclude some posts
 339         if not (p.permalink is None or p.draft is True):
 340             posts.append(p)
 341     posts.sort(key=operator.attrgetter('date'), reverse=True)
 342     return posts