BigW Consortium Gitlab

Commit 39d60294 by Forest Godfrey

Python post parser to take Tumblr XML and schedule it with Hootsuite

parent eeea7342
import datetime
from datetime import timedelta
import xml.etree.ElementTree as ET
import re
def strip_html(text):
    """Return *text* with every ``<...>`` tag removed.

    Only the tags themselves are dropped; the text between them is kept
    and HTML entities (``&amp;`` etc.) are left untouched.
    """
    # ``[^>]*`` (rather than ``.*?``) also matches newlines, so a tag whose
    # attributes wrap onto a second line is removed too.
    return re.sub(r"<[^>]*>", "", text)
def strip_commas(text):
    """Return *text* with every comma removed.

    The script emits comma-separated lines without quoting, so commas
    inside a field would otherwise split it into extra columns.
    """
    return text.replace(",", "")
def remove_non_ascii(text):
    """Return *text* with each non-ASCII character replaced by a space.

    Characters whose code point is below 128 are kept as they are; every
    other character becomes a single ``' '``, so the length is preserved.
    """
    return "".join(ch if ord(ch) < 128 else " " for ch in text)
def process_post(post):
    """Extract the widest photo URL and a cleaned caption from a post element.

    *post* is an ElementTree element. Its ``photo-url`` children are
    scanned and the text of the one with the largest integer ``max-width``
    attribute is chosen. The first ``photo-caption`` child supplies the
    caption, which is stripped of HTML tags and commas and has non-ASCII
    characters replaced by spaces.

    Returns a ``(url, caption)`` tuple of strings. Either part is ``""``
    when the post has no usable ``photo-url`` / ``photo-caption``.
    """
    best_url = ""
    best_width = 0
    for url in post.findall('photo-url'):
        try:
            width = int(url.get('max-width'))
        except (TypeError, ValueError):
            # Attribute missing (None) or not a number: ignore this
            # candidate instead of aborting the whole post.
            continue
        if width > best_width:
            best_width = width
            # An element without text yields None; keep the result a string.
            best_url = url.text or ""
    # findtext() returns "" both when the element is absent (default) and
    # when it is present but empty, so the cleaners never receive None.
    caption = post.findtext('photo-caption', default="")
    return best_url, remove_non_ascii(strip_commas(strip_html(caption)))
# Read the exported posts and print one "time,caption,url" line per post,
# giving each post its own scheduled time slot.
tree = ET.parse('/tmp/posts.xml')
root = tree.getroot()
# First scheduled slot; every following post is pushed back by `t`.
d = datetime.datetime(2019, 2, 21, 17, 0)
t = timedelta(minutes=30)
for post in root.findall('post'):
    post_time = d.strftime("%Y/%m/%d %H:%M")
    post_url, post_text = process_post(post)
    d = d + t
    # Parenthesised single-argument form works on both Python 2 and 3.
    print("%s,%s,%s" % (post_time, post_text, post_url))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment