用Python解析自定义Twitter数据并生成统计信息
我正在尝试从一个特定的数据集中收集Twitter的统计信息,这个数据集是别人提供给我的。我无法控制数据在给我之前是如何格式化的,所以我只能处理这种杂乱的格式。
我希望能得到一些建议,如何用Python编写一个程序来解析这种输入,并输出成类似CSV文件的格式,字段标题作为表头,下面是对应的值。
我想用Python,因为我最终想使用一些我已经准备好的统计工具。
另外,我希望输出CSV格式,因为我可能会把它输入到像SPSS这样的工具中进行统计验证。
这是数据集中一个帖子样本的样子:
{"text":"A gente todos os dias arruma os cabelos: por que não o coração?","contributors":null,"geo":null,"retweeted":false,"in_reply_to_screen_name":null,"truncated":false,"entities":{"urls":[],"hashtags":[],"user_mentions":[]},"in_reply_to_status_id_str":null,"id":50270714498002945,"source":"web","in_reply_to_user_id_str":null,"favorited":false,"in_reply_to_status_id":null,"created_at":"Tue Mar 22 19:00:46 +0000 2011","in_reply_to_user_id":null,"retweet_count":0,"id_str":"50270714498002945","place":null,"user":{"location":"Brasil, Recife-PE","statuses_count":16,"profile_background_tile":true,"lang":"en","profile_link_color":"867c5f","id":59154474,"following":null,"favourites_count":0,"protected":false,"profile_text_color":"91957f","verified":false,"contributors_enabled":false,"description":"","profile_sidebar_border_color":"eae2bc","name":"Natalia Aráujo","profile_background_color":"eae2bc","created_at":"Wed Jul 22 15:27:15 +0000 2009","followers_count":10,"geo_enabled":false,"profile_background_image_url":"http://a3.twimg.com/profile_background_images/220796682/music-2.png","follow_request_sent":null,"url":null,"utc_offset":-10800,"time_zone":"Brasilia","notifications":null,"profile_use_background_image":true,"friends_count":18,"profile_sidebar_fill_color":"eae2bc","screen_name":"nat_araujo","id_str":"59154474","show_all_inline_media":false,"profile_image_url":"http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG","listed_count":1,"is_translator":false},"coordinates":null}
这个数据集是一整行,没有换行符分隔帖子。实际帖子之间唯一的分隔符是:
所有帖子都以
{"text":
开头,并以
null}
结尾,当然我很乐意与大家分享我的结果。
根据大家的建议,我开始写了以下代码:
import sys
import json
from pprint import pprint
if len(sys.argv) != 2:
print 'To Use: twitterjson2cvs.py (path/filename)'
sys.exit()
inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)
pprint(jsondatain)
inputfile.close()
这段代码输出的结果稍微干净一些,格式如下:
{u'contributors': None,
u'coordinates': None,
u'created_at': u'Tue Mar 22 19:00:46 +0000 2011',
u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},
u'favorited': False,
u'geo': None,
u'id': 50270714498002945L,
u'id_str': u'50270714498002945',
u'in_reply_to_screen_name': None,
u'in_reply_to_status_id': None,
u'in_reply_to_status_id_str': None,
u'in_reply_to_user_id': None,
u'in_reply_to_user_id_str': None,
u'place': None,
u'retweet_count': 0,
u'retweeted': False,
u'source': u'web',
u'text': u'A gente todos os dias arruma os cabelos: por que n\xe3o o cora\xe7\xe3o?',
u'truncated': False,
u'user': {u'contributors_enabled': False,
u'created_at': u'Wed Jul 22 15:27:15 +0000 2009',
u'description': u'',
u'favourites_count': 0,
u'follow_request_sent': None,
u'followers_count': 10,
u'following': None,
u'friends_count': 18,
u'geo_enabled': False,
u'id': 59154474,
u'id_str': u'59154474',
u'is_translator': False,
u'lang': u'en',
u'listed_count': 1,
u'location': u'Brasil, Recife-PE',
u'name': u'Natalia Ar\xe1ujo',
u'notifications': None,
u'profile_background_color': u'eae2bc',
u'profile_background_image_url': u'http://a3.twimg.com/profile_background_images/220796682/music-2.png',
u'profile_background_tile': True,
u'profile_image_url': u'http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG',
u'profile_link_color': u'867c5f',
u'profile_sidebar_border_color': u'eae2bc',
u'profile_sidebar_fill_color': u'eae2bc',
u'profile_text_color': u'91957f',
u'profile_use_background_image': True,
u'protected': False,
u'screen_name': u'nat_araujo',
u'show_all_inline_media': False,
u'statuses_count': 16,
u'time_zone': u'Brasilia',
u'url': None,
u'utc_offset': -10800,
u'verified': False}
我在之前的代码上做了一些修改,试图输出到CSV文件:
import sys
import json
#from pprint import pprint
import csv
if len(sys.argv) != 2:
print 'To Use: twitterjson2cvs.py (path/filename)'
sys.exit()
inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)
f=csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]])
#pprint(jsondatain)
inputfile.close()
但是当我运行它时,我得到了:
File "twitterjson2cvs.py", line 28, in <module>
f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]])
TypeError: string indices must be integers
这个错误似乎与字段的格式有关,但我看不出问题出在哪里。
我更新了代码,以反映你们的格式建议,修改如下:
import sys
import json
import csv
if len(sys.argv) != 2:
print 'To Use: twitterjson2cvs.py (path/filename)'
sys.exit()
inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)
f=csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
f.writerow(
(
x['contributors'],
x['coordinates'],
x['created_at'],
x['entities']['hashtags'],
x['entities']['urls'],
x['entities']['user_mentions'],
x['favorited'],
x['geo'],
x['id'],
x['id_str'],
x['in_reply_to_screen_name'],
x['in_reply_to_status_id'],
x['in_reply_to_status_id_str'],
x['in_reply_to_user_id'],
x['in_reply_to_user_id_str'],
x['place'],
x['retweet_count'],
x['retweeted'],
x['source'],
x['text'].encode('utf8'),
x['truncated'],
x['user']['contributors_enabled'],
x['user']['created_at'],
x['user']['description'],
x['user']['favourites_count'],
x['user']['follow_request_sent'],
x['user']['followers_count'],
x['user']['following'],
x['user']['friends_count'],
x['user']['geo_enabled'],
x['user']['id'],
x['user']['id_str'],
x['user']['is_translator'],
x['user']['lang'],
x['user']['listed_count'],
x['user']['location'],
x['user']['name'].encode('utf8'),
x['user']['notifications'],
x['user']['profile_background_color'],
x['user']['profile_background_image_url'],
x['user']['profile_background_tile'],
x['user']['profile_image_url'],
x['user']['profile_link_color'],
x['user']['profile_sidebar_border_color'],
x['user']['profile_sidebar_fill_color'],
x['user']['profile_text_color'],
x['user']['profile_use_background_image'],
x['user']['protected'],
x['user']['screen_name'],
x['user']['show_all_inline_media'],
x['user']['statuses_count'],
x['user']['time_zone'],
x['user']['url'],
x['user']['utc_offset'],
x['user']['verified']
)
)
inputfile.close()
但我仍然遇到以下错误:
twitterjson2cvs.py TweetFile1300820340639.tcm.online
Traceback (most recent call last):
File "workspace/coalmine-datafilter/src/twitterjson2csv.py", line 30, in <module>
x['contributors'],
TypeError: string indices must be integers
到目前为止,单个JSON格式的输入文件一切正常。之前的例子中输入的JSON字符串:
import sys
import json
import csv
if len(sys.argv) != 2:
print 'To Use: twitterjson2cvs.py (path/filename)'
sys.exit()
inputfile = open(sys.argv[1])
jsonindata = json.load(inputfile)
f=csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
f.writerow(
(
jsonindata['contributors'],
jsonindata['coordinates'],
jsonindata['created_at'],
jsonindata['entities']['hashtags'],
jsonindata['entities']['urls'],
jsonindata['entities']['user_mentions'],
jsonindata['favorited'],
jsonindata['geo'],
jsonindata['id'],
jsonindata['id_str'],
jsonindata['in_reply_to_screen_name'],
jsonindata['in_reply_to_status_id'],
jsonindata['in_reply_to_status_id_str'],
jsonindata['in_reply_to_user_id'],
jsonindata['in_reply_to_user_id_str'],
jsonindata['place'],
jsonindata['retweet_count'],
jsonindata['retweeted'],
jsonindata['source'],
jsonindata['text'].encode('utf8'),
jsonindata['truncated'],
jsonindata['user']['contributors_enabled'],
jsonindata['user']['created_at'],
jsonindata['user']['description'],
jsonindata['user']['favourites_count'],
jsonindata['user']['follow_request_sent'],
jsonindata['user']['followers_count'],
jsonindata['user']['following'],
jsonindata['user']['friends_count'],
jsonindata['user']['geo_enabled'],
jsonindata['user']['id'],
jsonindata['user']['id_str'],
jsonindata['user']['is_translator'],
jsonindata['user']['lang'],
jsonindata['user']['listed_count'],
jsonindata['user']['location'],
jsonindata['user']['name'].encode('utf8'),
jsonindata['user']['notifications'],
jsonindata['user']['profile_background_color'],
jsonindata['user']['profile_background_image_url'],
jsonindata['user']['profile_background_tile'],
jsonindata['user']['profile_image_url'],
jsonindata['user']['profile_link_color'],
jsonindata['user']['profile_sidebar_border_color'],
jsonindata['user']['profile_sidebar_fill_color'],
jsonindata['user']['profile_text_color'],
jsonindata['user']['profile_use_background_image'],
jsonindata['user']['protected'],
jsonindata['user']['screen_name'],
jsonindata['user']['show_all_inline_media'],
jsonindata['user']['statuses_count'],
jsonindata['user']['time_zone'],
jsonindata['user']['url'],
jsonindata['user']['utc_offset'],
jsonindata['user']['verified']
)
)
inputfile.close()
生成了一个格式良好的输出,准备好供像SPSS这样的工具使用,格式如下:
contributors,coordinates,created_at,entities,hashtags,urls,user_mentions,favorited,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,place,retweet_count,retweeted,source,text,truncated,user,contributors_enabled,created_at,description,favourites_count,follow_request_sent,followers_count,following,friends_count,geo_enabled,id,id_str,is_translator,lang,listed_count,location,name,notifications,profile_background_color,profile_background_image_url,profile_background_tile,profile_image_url,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,protected,screen_name,show_all_inline_media,statuses_count,time_zone,url,utc_offset,verified
,,Tue Mar 22 19:00:46 +0000 2011,[],[],[],False,,50270714498002945,50270714498002945,,,,,,,0,False,web,A gente todos os dias arruma os cabelos: por que não o coração?,False,False,Wed Jul 22 15:27:15 +0000 2009,,0,,10,,18,False,59154474,59154474,False,en,1,"Brasil, Recife-PE",Natalia Aráujo,,eae2bc,http://a3.twimg.com/profile_background_images/220796682/music-2.png,True,http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG,867c5f,eae2bc,eae2bc,91957f,True,False,nat_araujo,False,16,Brasilia,,-10800,False
现在唯一的问题是,我有一些输入文件,其中包含多个JSON字符串,它们都在同一行中连续排列。当我尝试在这些文件上运行相同的程序时,我得到了以下错误:
Traceback (most recent call last):
File "workspace/coalmine-datafilter/src/twitterjson2cvs.py", line 22, in <module>
jsonindata = json.load(inputfile)
File "/usr/lib/python2.6/json/__init__.py", line 267, in load
parse_constant=parse_constant, **kw)
File "/usr/lib/python2.6/json/__init__.py", line 307, in loads
return _default_decoder.decode(s)
File "/usr/lib/python2.6/json/decoder.py", line 322, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 1514 - line 2 column 1 (char 1514 - 2427042)
输入文件非常大(例如:有几千条Twitter帖子),我不知道这个错误是由于帖子数量过多,还是因为文件中有多个{"...."}{"...."}
都在同一行。也许我需要在每个数据之间添加换行符?
2 个回答
这里的输入是JSON格式。Python有一个JSON模块,而且它还有一个CSV模块。所以你的输入和输出都可以搞定了!
更新
你快到了!
你调用的writerow()
方法需要看起来更像这样(而不是在一个for
循环里):
f.writerow(
(
jsonindata['contributors'],
jsonindata['coordinates'],
jsonindata['created_at'],
jsonindata['entities']['hashtags'],
jsonindata['entities']['urls'],
jsonindata['entities']['user_mentions'],
jsonindata['favorited'],
jsonindata['geo'],
jsonindata['id'],
jsonindata['id_str'],
jsonindata['in_reply_to_screen_name'],
jsonindata['in_reply_to_status_id'],
jsonindata['in_reply_to_status_id_str'],
jsonindata['in_reply_to_user_id'],
jsonindata['in_reply_to_user_id_str'],
jsonindata['place'],
jsonindata['retweet_count'],
jsonindata['retweeted'],
jsonindata['source'],
jsonindata['text'].encode('utf8'),
jsonindata['truncated'],
jsonindata['user']['contributors_enabled'],
jsonindata['user']['created_at'],
jsonindata['user']['description'],
jsonindata['user']['favourites_count'],
jsonindata['user']['follow_request_sent'],
jsonindata['user']['followers_count'],
jsonindata['user']['following'],
jsonindata['user']['friends_count'],
jsonindata['user']['geo_enabled'],
jsonindata['user']['id'],
jsonindata['user']['id_str'],
jsonindata['user']['is_translator'],
jsonindata['user']['lang'],
jsonindata['user']['listed_count'],
jsonindata['user']['location'],
jsonindata['user']['name'].encode('utf8'),
jsonindata['user']['notifications'],
jsonindata['user']['profile_background_color'],
jsonindata['user']['profile_background_image_url'],
jsonindata['user']['profile_background_tile'],
jsonindata['user']['profile_image_url'],
jsonindata['user']['profile_link_color'],
jsonindata['user']['profile_sidebar_border_color'],
jsonindata['user']['profile_sidebar_fill_color'],
jsonindata['user']['profile_text_color'],
jsonindata['user']['profile_use_background_image'],
jsonindata['user']['protected'],
jsonindata['user']['screen_name'],
jsonindata['user']['show_all_inline_media'],
jsonindata['user']['statuses_count'],
jsonindata['user']['time_zone'],
jsonindata['user']['url'],
jsonindata['user']['utc_offset'],
jsonindata['user']['verified']
)
)
另外,可以考虑使用DictWriter
,但要记住Python的CSV模块对Unicode处理得不好,所以在元组的几个元素上要加上.encode('utf8')
。
这段话是说,这个可以帮助你入门……不过你还需要处理一下里面的嵌套对象。
import json
import csv
f = file('test.json', 'r')
data = json.load(f)
#result = []
for k,v in data.iteritems():
print k,v
#result.append(v)
f = file('output.csv', 'w')
writer = csv.writer(f)
writer.writerows(result)