Created
August 29, 2013 20:15
-
-
Save kasparsd/6382878 to your computer and use it in GitHub Desktop.
Parsing HTML using querypath library and creating a JSON feed of all blog posts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Converts saved blog post pages into a JSON feed.
 *
 * Reads every posts/*.html file, extracts the post title, content, category,
 * tags and comments with QueryPath CSS selectors, and writes the collected
 * posts to export.json in the current working directory.
 *
 * The part of each file name before the first "_" is passed to date() as the
 * post's Unix timestamp; files without such a numeric prefix are skipped.
 *
 * A disabled (commented-out) WXR/RSS export is kept below for reference.
 */

require 'querypath/src/qp.php';

$posts = array();

// Replacements applied to the comment date text so that strtotime() can
// parse it. NOTE(review): "kl.", "des", "mai" and "okt" look like Norwegian
// date notation -- confirm against the scraped pages.
$map = array(
    'kl.' => ' ',
    '.' => ' ',
    'des' => 'dec',
    'mai' => 'may',
    'okt' => 'oct'
);

// Loop-invariant: split the map once instead of once per comment.
$map_search  = array_keys( $map );
$map_replace = array_values( $map );

// glob() may return false on error; treat that the same as "no files".
$files = glob( 'posts/*.html' );
if ( false === $files ) {
    $files = array();
}

foreach ( $files as $file ) {
    // Leading "<timestamp>_" part of the file name.
    $date = current( explode( '_', basename( $file ) ) );

    // Skip files whose prefix is not a plausible timestamp (checked before
    // reading the file so skipped files cost no I/O).
    if ( ! is_numeric( $date ) || $date < 100000 ) {
        continue;
    }

    $html = file_get_contents( $file );

    if ( false === $html ) {
        // Report and move on rather than parsing `false` as a document.
        trigger_error( sprintf( 'Could not read %s, skipping.', $file ), E_USER_WARNING );
        continue;
    }

    $tags = array();

    foreach ( htmlqp( $html, '#main .meta .tags a' ) as $tag ) {
        $tags[] = $tag->text();
    }

    $comments = array();

    foreach ( htmlqp( $html, '#comments .comment' ) as $comment ) {
        $comment_date_text = str_replace( $map_search, $map_replace, $comment->find('.meta p')->text() );
        $comment_time      = strtotime( $comment_date_text );

        $comments[] = array(
            'comment_author' => strip_tags( $comment->find('.meta h4')->text() ),
            'comment_author_url' => $comment->find('.meta h4 a')->attr('href'),
            'comment_content' => $comment->find('.content')->text(),
            // strtotime() returns false for text it cannot parse; emit null
            // instead of letting date() turn that into a bogus 1970 date.
            'comment_date' => ( false === $comment_time ) ? null : date( 'r', $comment_time ),
        );
    }

    $post = array(
        'post_title' => htmlqp( $html, '.entry h2' )->text(),
        'post_content' => htmlqp( $html, '.content' )->children()->html(),
        'post_date' => date( 'r', $date ),
        'categories' => htmlqp( $html, '#main .meta:eq(3) li:first a' )->text(),
        'tags' => $tags,
        'comments' => $comments
    );

    $posts[] = $post;

    // Disabled WXR item builder. NOTE: $items is never initialised, so add
    // `$items = array();` before the loop if this is ever re-enabled.
    /*
    $item = array(
        sprintf( '<title>%s</title>', $post['post_title'] ),
        sprintf( '<pubDate>%s</pubDate>', $post['post_date'] ),
        sprintf( '<content:encoded><![CDATA[%s]]></content:encoded>', $post['post_content'] ),
        sprintf( '<category domain="category"><![CDATA[%s]]></category>', $post['categories'] ),
        '<wp:post_type>post</wp:post_type>'
    );
    foreach ( $post['tags'] as $tag )
        $item[] = sprintf( '<category domain="post_tag"><![CDATA[%s]]></category>', $tag );
    foreach ( $post['comments'] as $comment )
        $item[] = sprintf(
            '<wp:comment>
            <wp:comment_author><![CDATA[%s]]></wp:comment_author>
            <wp:comment_author_url>%s</wp:comment_author_url>
            <wp:comment_date>%s</wp:comment_date>
            <wp:comment_content><![CDATA[%s]]></wp:comment_content>
            </wp:comment>',
            $comment['comment_author'],
            $comment['comment_author_url'],
            $comment['comment_date'],
            $comment['comment_content']
        );
    $items[] = sprintf( '<item>%s</item>', implode( "\n", $item ) );
    */
}

// json_encode() returns false when the data cannot be encoded (for example
// on invalid UTF-8); do not write that to disk or report success.
$json = json_encode( $posts );

if ( false === $json ) {
    echo 'ERROR: could not encode posts as JSON (json_last_error code ' . json_last_error() . ')';
    exit( 1 );
}

if ( false === file_put_contents( 'export.json', $json ) ) {
    echo 'ERROR: could not write export.json';
    exit( 1 );
}

echo 'DONE';

/*
// RSS export
file_put_contents(
    'export.xml',
    sprintf(
        '<?xml version="1.0" encoding="UTF-8"?>
        <rss>
        <wp:wxr_version>1.2</wp:wxr_version>
        <channel>%s</channel>
        </rss>',
        implode( "\n", $items )
    )
);
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment