lily solr hbase indexer:添加索引器indexdemo-indexer.xml

时间:2017-04-12 16:14:32

标签: hadoop solr hbase bigdata

我正在使用 Solr 和 Lily,对存储在 HBase 中的 Twitter 数据建立索引,并根据 'hadoop'、'bigdata'、'computer science' 等关键字生成搜索结果。

hbase中的一行如下:

838720557562609665:1488801538:180782707:   column=json:tweetJSON, timestamp=1488801607097, value={"created_at":"Mon Mar 06 11:58:58 +0000 2017","id":838720557562609665,"i
                                            d_str":"838720557562609665","text":"RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote  #fintech #BigData #IoT
                                             #insurtech #cryptocurrency\x5Cu2026 ","source":"\x5Cu003ca href=\x5C"https:\x5C/\x5C/about.twitter.com\x5C/products\x5C/tweetd
                                            eck\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eTweetDeck\x5Cu003c\x5C/a\x5Cu003e","truncated":false,"in_reply_to_status_id":null,"in_r
                                            eply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"i
                                            d":180782707,"id_str":"180782707","name":"bitiji","screen_name":"bitiji","location":"Zevilla ciberespaci\x5Cu00e1","url":"http:
                                            \x5C/\x5C/bitiji.com","description":"Nac\x5Cu00ed, crec\x5Cu00ed, me viraliz\x5Cu00e9 y  mor\x5Cu00ed... y vuerta a empez\x5Cu0
                                            0e1. hacia el infinito y + all\x5Cu00e1  .\x5Cr\x5CnMatria del eco(NO)sistema @bitiji","protected":false,"verified":false,"foll
                                            owers_count":964,"friends_count":700,"listed_count":157,"favourites_count":124,"statuses_count":17870,"created_at":"Fri Aug 20 
                                            13:31:49 +0000 2010","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_tra
                                            nslator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_bac
                                            kground_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\
                                            x5C/profile_background_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_tile":true,"profile_link_color":"
                                            0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_us
                                            e_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2f056cd86881e4
                                            91f42c4bd942f5c5be_normal.png","profile_image_url_":"\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2
                                            f056cd86881e491f42c4bd942f5c5be_normal.png","profile_banner_url":":\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/1807827
                                            07\x5C/1398242563","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notificat
                                            ions":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Mar 01 13:35:0
                                            5 +0000 2017","id":836932805879820288,"id_str":"836932805879820288","text":"#Blockchain Technology Breakdown [img] by @FollowMy
                                            Vote  #fintech #BigData #IoT #insurtech #cryptocurrency\x5Cu2026 :\x5C/\x5C/t.co\x5C/KuYmu4lh8A","display_text_range":[0,1
                                            40],"source":"\x5Cu003ca href=\x5C"http:\x5C/\x5C/www.hootsuite.com\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eHootsuite\x5Cu003c\x5C/
                                            a\x5Cu003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply
                                            _to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3122211,"id_str":"3122211","name":"eraser ju\x5Cu24b6njo * \x
                                            5Cu2718 \x5Cu2605","screen_name":"eraser","location":"Sevilla","url":"http:\x5C/\x5C/e-learning-teleformacion.blogspot.com","de
                                            scription":"PhD student @fceyeUS @unisevilla elige la clave dela vida abcchdefghij... \x5Cu2718\x5Cu24d4-\x5Cu24dd\x5Cu24d0\x5C
                                            u24e4\x5Cu24e3\x5Cu24d0, \x5Cu24d4-\x5Cu24dc\x5Cu24d4\x5Cu24dd\x5Cu24e3\x5Cu24d4 Sevilla \x5Cu2605 elearning \x5Cu2605\x5Cu24b6
                                            r\x5Cu24e3\x5Cu2605 education \x5Cu2605 P2P \x5Cu2605 blockchain \x5Cu2605 economy","protected":false,"verified":false,"followe
                                            rs_count":21208,"friends_count":11566,"listed_count":2074,"favourites_count":4946,"statuses_count":474839,"created_at":"Sun Apr
                                             01 12:12:45 +0000 2007","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"en","contributors_enabled":false,"is
                                            _translator":false,"profile_background_color":"9AE4E8","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile
                                            _background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_image_url_https":"https:\x5C/\x
                                            5C/pbs.twimg.com\x5C/profile_background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_til
                                            e":true,"profile_link_color":"0000FF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E0FF92","profile_te
                                            xt_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C
                                            /599157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/59
                                            9157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_banner_url":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/3122211\x
                                            5C/1438841267","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications
                                            ":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"#
                                            Blockchain Technology Breakdown [img] by @FollowMyVote  #fintech #BigData #IoT #insurtech #cryptocurrency #smartcities #DeepLea
                                            rning https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_text_range":[0,133],"entities":{"hashtags":[{"text":"Blockchain","indices":
                                            [0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74]},{"text":"IoT","indices":[75,79]},{"text":"ins
                                            urtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]},{"text":"smartcities","indices":[107,119]},{"text":"Dee
                                            pLearning","indices":[120,133]}],"urls":[],"user_mentions":[{"screen_name":"FollowMyVote","name":"FollowMyVote","id":392924202,
                                            "id_str":"392924202","indices":[42,55]}],"symbols":[],"media":[{"id":836932802947973120,"id_str":"836932802947973120","indices"
                                            :[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_https":"https:\x5C/\x5C/pbs.
                                            twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url":"pic.twitter.com\x5C/Ppb
                                            Sfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/photo\x5C/1","type":"phot
                                            o","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":480,"h":1200,"res
                                            ize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":836932802947973120,"id_str":"83693
                                            2802947973120","indices":[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_http
                                            s":"https:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url":
                                            "pic.twitter.com\x5C/PpbSfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/p
                                            hoto\x5C/1","type":"photo","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium
                                            ":{"w":480,"h":1200,"resize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]}},"retweet_count":15,"favorite_count":5,"entiti
                                            es":{"hashtags":[{"text":"Blockchain","indices":[0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74
                                            ]},{"text":"IoT","indices":[75,79]},{"text":"insurtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]}],"urls"
                                            :[{"url":"https:\x5C/\x5C/t.co\x5C/KuYmu4lh8A","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/i\x5C/web\x5C/status\x5C/8369328
                                            05879820288","display_url":"twitter.com\x5C/i\x5C/web\x5C/status\x5C/8\x5Cu2026","indices":[108,131]}],"user_mentions":[{"scree
                                            n_name":"FollowMyVote","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[42,55]}],"symbols":[]},"favorited":
                                            false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,
                                            "favorite_count":0,"entities":{"hashtags":[{"text":"Blockchain","indices":[12,23]},{"text":"fintech","indices":[69,77]},{"text"
                                            :"BigData","indices":[78,86]},{"text":"IoT","indices":[87,91]},{"text":"insurtech","indices":[92,102]},{"text":"cryptocurrency"
                                            ,"indices":[103,118]}],"urls":[{"url":"","expanded_url":null,"indices":[120,120]}],"user_mentions":[{"screen_name":"eraser","na
                                            me":"eraser ju\x5Cu24b6njo * \x5Cu2718 \x5Cu2605","id":3122211,"id_str":"3122211","indices":[3,10]},{"screen_name":"FollowMyVot
                                            e","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[54,67]}],"symbols":[]},"favorited":false,"retweeted":fa
                                            lse,"filter_level":"low","lang":"en","timestamp_ms":"1488801538254"}\x0D\x0A                                                   
 838720557562609665:1488801538:180782707:   column=tweetdata:coordinates, timestamp=1488801607097, value=NA                                                                
 838720557562609665:1488801538:180782707:   column=tweetdata:created_at, timestamp=1488801607097, value=1488801538                                                         
 838720557562609665:1488801538:180782707:   column=tweetdata:created_time_lucene, timestamp=1488801607097, value=2017-03-06T11:58:58Z                                      
 838720557562609665:1488801538:180782707:   column=tweetdata:hashtags, timestamp=1488801607097, value=Blockchain, fintech, BigData, IoT, insurtech, cryptocurrency         
 838720557562609665:1488801538:180782707:   column=tweetdata:id, timestamp=1488801607097, value=838720557562609665                                                         
 838720557562609665:1488801538:180782707:   column=tweetdata:in_reply_to_screen_name, timestamp=1488801607097, value=NA                                                    
 838720557562609665:1488801538:180782707:   column=tweetdata:in_reply_to_status_id, timestamp=1488801607097, value=NA                                                      
 838720557562609665:1488801538:180782707:   column=tweetdata:in_reply_to_user_id, timestamp=1488801607097, value=NA                                                        
 838720557562609665:1488801538:180782707:   column=tweetdata:place, timestamp=1488801607097, value=NA                                                                      
 838720557562609665:1488801538:180782707:   column=tweetdata:retweeted_status_id, timestamp=1488801607097, value=836932805879820288                                        
 838720557562609665:1488801538:180782707:   column=tweetdata:retweeted_status_text, timestamp=1488801607097, value=#Blockchain Technology Breakdown [img] by @FollowMyVote 
                                             #fintech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6                                     
 838720557562609665:1488801538:180782707:   column=tweetdata:retweeted_status_user_id, timestamp=1488801607097, value=3122211                                              
 838720557562609665:1488801538:180782707:   column=tweetdata:retweeted_status_user_name, timestamp=1488801607097, value=eraser ju\xE2\x92\xB6njo * \xE2\x9C\x98 \xE2\x98\x8
                                            5                                                                                                                              
 838720557562609665:1488801538:180782707:   column=tweetdata:source, timestamp=1488801607097, value=<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">T
                                            weetDeck</a>                                                                                                                   
 838720557562609665:1488801538:180782707:   column=tweetdata:text, timestamp=1488801607097, value=RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote  #fin
                                            tech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6                                                                      
 838720557562609665:1488801538:180782707:   column=tweetdata:urls, timestamp=1488801607097, value=                                                                         
 838720557562609665:1488801538:180782707:   column=tweetdata:usermentions, timestamp=1488801607097, value=eraser, FollowMyVote                                             
 838720557562609665:1488801538:180782707:   column=user:followers_count, timestamp=1488801607097, value=964                                                                
 838720557562609665:1488801538:180782707:   column=user:following_count, timestamp=1488801607097, value=NA                                                                 
 838720557562609665:1488801538:180782707:   column=user:friends_count, timestamp=1488801607097, value=700                                                                  
 838720557562609665:1488801538:180782707:   column=user:id, timestamp=1488801607097, value=180782707                                                                       
 838720557562609665:1488801538:180782707:   column=user:profile_image_url, timestamp=1488801607097, value=http://pbs.twimg.com/profile_images/2859966744/2f056cd86881e491f4
                                            2c4bd942f5c5be_normal.png                                                                                                      
 838720557562609665:1488801538:180782707:   column=user:screen_name, timestamp=1488801607097, value=bitiji                                                                 
 838720557562609665:1488801538:180782707:   column=user:timezone, timestamp=1488801607097, value=Madrid 

我已经完成了 Solr 和 Lily 的设置,现在只剩最后一步:添加索引器(adding an indexer)。

# Add an indexer via the hbase-indexer CLI, using the definition file indexdemo-indexer.xml.
# NOTE(review): -n appears to set the indexer name ("myindexer") and -c the definition file;
# the two -cp options pass solr.zk and solr.collection (presumably the Solr ZooKeeper
# address and the target collection) -- confirm against the hbase-indexer documentation.
./bin/hbase-indexer add-indexer -n myindexer -c indexdemo-indexer.xml \
    -cp solr.zk=localhost:2181/solr -cp solr.collection=collection1

要执行上述命令,我需要编写 indexdemo-indexer.xml 文件。示例如下:

<?xml version="1.0"?>
<!-- Sample indexer definition for the table "indexdemo-user".
     Each <field> pairs a name (presumably the target Solr field; confirm against the
     Solr schema) with a value written as family:qualifier, here always in the
     "info" family. -->
<indexer table="indexdemo-user">
  <field name="firstname_s" value="info:firstname"/>
  <field name="lastname_s" value="info:lastname"/>
  <!-- The only field that declares an explicit type ("int"); the others omit it. -->
  <field name="age_i" value="info:age" type="int"/>
</indexer>

如何为我的数据(即上面给出的 HBase 示例行)编写这样的文件?(注:我想可以使用 column=tweetdata:hashtags 或 column=tweetdata:text 这两列,但具体该怎么写?)

1 个答案:

答案 0 :(得分:0)

以下是一种方法:

<?xml version="1.0"?>
<!-- Indexer definition covering the tweetdata:* and user:* columns of the sample row.
     The raw json:tweetJSON column is not mapped by any field below.
     No field declares a type attribute.
     NOTE(review): the table name "indextweet" is not confirmed by the question text;
     verify that it matches the actual HBase table. -->
<indexer table="indextweet">
  <!-- Columns from the "tweetdata" family. -->
  <!-- NOTE(review): a field named "id" may clash with the unique key field that the
       indexer fills in itself (presumably from the row key); confirm before use. -->
  <field name="id" value="tweetdata:id"/>
  <field name="created_at" value="tweetdata:created_at"/>
  <field name="created_time_lucene" value="tweetdata:created_time_lucene"/>
  <field name="text" value="tweetdata:text"/>
  <field name="source" value="tweetdata:source"/>
  <!-- The sample row holds the literal value NA in the three in_reply_to_* columns,
       and also in coordinates and place; presumably that string is what gets
       indexed for them. TODO confirm. -->
  <field name="in_reply_to_status_id" value="tweetdata:in_reply_to_status_id"/>
  <field name="in_reply_to_user_id" value="tweetdata:in_reply_to_user_id"/>
  <field name="in_reply_to_screen_name" value="tweetdata:in_reply_to_screen_name"/>
  <field name="coordinates" value="tweetdata:coordinates"/>
  <field name="place" value="tweetdata:place"/>
  <field name="retweeted_status_id" value="tweetdata:retweeted_status_id"/>
  <field name="retweeted_status_text" value="tweetdata:retweeted_status_text"/>
  <field name="retweeted_status_user_id" value="tweetdata:retweeted_status_user_id"/>
  <field name="retweeted_status_user_name" value="tweetdata:retweeted_status_user_name"/>
  <!-- In the sample row, hashtags and usermentions are single comma-separated strings. -->
  <field name="hashtags" value="tweetdata:hashtags"/>
  <field name="urls" value="tweetdata:urls"/>
  <field name="usermentions" value="tweetdata:usermentions"/>
  <!-- Columns from the "user" family; user:id is exposed as "userid", which keeps it
       distinct from the tweet "id" field above. -->
  <field name="userid" value="user:id"/>
  <field name="screen_name" value="user:screen_name"/>
  <field name="timezone" value="user:timezone"/>
  <field name="followers_count" value="user:followers_count"/>
  <field name="friends_count" value="user:friends_count"/>
  <!-- The sample row holds the literal value NA in user:following_count. -->
  <field name="following_count" value="user:following_count"/>
  <field name="profile_image_url" value="user:profile_image_url"/>
</indexer>