谷歌api搜索结果

时间:2014-04-04 20:33:58

标签: python google-app-engine google-api

您好,我正在尝试使用 Google App Engine 创建一个学术项目。我希望通过 Google API 实现的主要功能是:搜索用户输入的查询并返回结果,以便我对结果做进一步分析。xgoogle 是我的第一次尝试,但它在 Google App Engine 上会出现导入错误(同样的代码在本地可以正常运行)。即使 xgoogle 能正常工作,我也不知道如何获取不同网站的内容。我想知道是否有办法将谷歌搜索结果以文件的形式返回。谢谢。

下面的代码有一部分使用正则表达式来查找 HTML 代码中包含的内容,但它不适用于 HTML 布局不同的网站。谢谢。

#!/usr/bin/env python
from html import HTML_PAGE
import webapp2
import jinja2
import os
import re
import sys

from os import walk
from google.appengine.api import search
from google.appengine.ext import ndb
from urllib import urlopen
from cgi import parse_qs

from xgoogle.search import GoogleSearch, SearchError


# Shared page helper, created once at import time; the request handlers below
# call its pageChange() method and write the returned value to the response.
page = HTML_PAGE()

class MainPage(webapp2.RequestHandler):
    """Request handler that serves the initial page on GET."""

    def get(self):
        """Write the output of ``page.pageChange()`` to the response."""
        rendered = page.pageChange()
        self.response.out.write(rendered)




class SearchFile(object):
    """Search the files directly inside a folder for a piece of text.

    Only the top level of the folder is scanned; sub-directories are not
    descended into.

    Args:
        userInput: Text to look for. The match is a plain, case-sensitive
            substring test against each line. An empty string matches the
            first line of every non-empty file.
        folder: Directory whose files are searched. Defaults to
            ``"dataFolder"`` (relative to the current working directory).
    """

    def __init__(self, userInput='', folder="dataFolder"):
        self.__input = userInput
        self.__result = {}
        self.__files = []
        self.__filenames = []
        for dirpath, _dirnames, filenames in walk(folder):
            for name in filenames:
                self.__files.append(os.path.join(dirpath, name))
                self.__filenames.append(name)
            # Stop after the first triple so only the top level is listed.
            break

    def outPutData(self):
        """Return a dict mapping file name to its first matching line.

        Files with no matching line are omitted. The returned line keeps its
        trailing newline. The same dict object is stored on the instance and
        returned again (and updated) by later calls.
        """
        for name, path in zip(self.__filenames, self.__files):
            # The with-block closes the file; no explicit close() is needed.
            with open(path) as handle:
                for line in handle:
                    if self.__input in line:
                        self.__result[name] = line
                        break
        return self.__result


class SearchFileHandle(webapp2.RequestHandler):
    """Handle the search form POST and write three result sections.

    The response contains, in order: the page returned by
    ``page.pageChange(userInput)``, matches from the local data files
    (``SearchFile`` + ``CropText``), matching news headlines (``TakeNews``)
    and the web search results (``WebSearch``).
    """

    def post(self):
        """Run all searches for the posted ``input`` field and emit HTML."""
        # NOTE(review): str() presumably fails on non-ASCII input under
        # Python 2 -- confirm how the request value is encoded.
        userInput = str(self.request.get('input'))
        write = self.response.out.write
        write(page.pageChange(userInput))

        # NOTE(review): file names, file text and headlines below are written
        # into the HTML without escaping -- confirm the sources are trusted.

        # Local data files. The local is deliberately not called ``search``
        # so it does not shadow the imported App Engine ``search`` module.
        fileSearch = SearchFile(userInput)
        tramText = CropText()
        for key, value in fileSearch.outPutData().items():
            keyBold = "<b>%s</b><br>" % (key)
            keyLink = "<a href = \"dataFolder/%s\" name =\"%s\"> %s </a>" % (key, key, keyBold)
            write(keyLink)
            resultContain = tramText.tram(value, userInput)
            write(self._highlight(resultContain, userInput))
            write("<br><br><br>")

        # News headlines.
        news = TakeNews()
        for key, value in news.websiteRead(userInput).items():
            keyBold = "<b>%s</b><br>" % (key)
            keyLink = "<a href = %s> %s </a>" % (value, keyBold)
            write(keyLink)
            write("<br><br><br>")

        # Web search results.
        googleSearch = WebSearch(userInput)
        for res in googleSearch.returnResult():
            write(res)
            write("<br><br><br>")

    @staticmethod
    def _highlight(text, userInput):
        """Return *text* with every word that is a query keyword in bold.

        Every non-empty word of *text* is emitted exactly once. Previously a
        word that was merely a substring of the query (but not one of its
        keywords) was dropped, and a keyword repeated in the query was
        emitted once per repetition. Keywords are compared
        case-insensitively, matching the IGNORECASE search in CropText.tram.
        """
        keywords = set(k.lower() for k in userInput.split(" ") if k)
        pieces = []
        for word in text.split(" "):
            if not word:
                # Consecutive spaces produce empty tokens; nothing to show.
                continue
            if word.lower() in keywords:
                pieces.append(" <b>%s</b> " % (word))
            else:
                pieces.append(" %s " % (word))
        return "".join(pieces)


class CropText:
    """Cut a sentence-like excerpt around a word out of a longer text."""

    def tram(self, text, word):
        """Return the first excerpt of *text* surrounding *word*, or ''.

        The excerpt starts at a space, contains *word* (matched literally and
        case-insensitively) preceded by a space, and runs up to and including
        the next '.' that follows a space after the word.
        """
        pattern = r"( .*? )" + re.escape(word) + r"( .*?\.)"
        found = re.search(pattern, text, re.IGNORECASE)
        return found.group() if found else ''


class TakeNews(object):
    """Fetch a news site's front page and pick headlines matching a query.

    The site defaults to ``http://www.bloomberg.com`` and can be changed with
    ``setWebsite``. Headlines and links are extracted with regular
    expressions tied to one specific HTML layout.
    """

    # Headline text inside an "icon-article-headline" anchor.
    _TITLE_RE = re.compile('<a class=\"icon-article-headline\".*<span class=\'headline\'>(.*)</span>')
    # href value of an "icon-article-headline" anchor.
    _LINK_RE = re.compile('<a class=\"icon-article-headline\" data-id=.* data-type=.* href=\"(.*)\"><span class=\'headline\'>')

    def __init__(self):
        self.__website = 'http://www.bloomberg.com'
        self.__topNews = ''
        self.__topNewsTitle = ''

    def setWebsite(self, website):
        """Set the site to read, adding ``http://www.`` when it is missing.

        A value that already contains a scheme (``://``) is stored as is.
        Otherwise a leading '.' is dropped, ``www.`` is added unless already
        present, and ``http://`` is prepended. Previously the prefix was
        added without a dot, so ``"bloomberg.com"`` turned into
        ``"http://wwwbloomberg.com"``.
        """
        if '://' not in website:
            host = website.lstrip('.')
            if not host.startswith('www.'):
                host = 'www.' + host
            website = 'http://' + host
        self.__website = website

    def websiteRead(self, userInput):
        """Download the site and return ``{headline: url}`` for matches.

        A headline is kept when *userInput* occurs in its link (not in its
        title). The connection is closed even if reading fails.
        """
        response = urlopen(self.__website)
        try:
            webpage = response.read()
        finally:
            response.close()
        return self._matchHeadlines(webpage, userInput)

    def _matchHeadlines(self, webpage, userInput):
        """Pair extracted titles with links and filter them by *userInput*."""
        self.__topNewsTitle = self._TITLE_RE.findall(webpage)
        self.__topNews = self._LINK_RE.findall(webpage)
        result = {}
        # Titles and links are paired by position; zip() stops at the shorter
        # list, so a title without a matching link no longer raises
        # IndexError.
        for title, link in zip(self.__topNewsTitle, self.__topNews):
            if userInput in link:
                result[title] = self.__website + "/" + link
        return result

class WebSearch:
    """Run a single xgoogle ``GoogleSearch`` query for a given word."""

    def __init__(self, word):
        # Query text handed to GoogleSearch when returnResult() is called.
        self.__search = word

    def returnResult(self):
        """Return ``get_results()`` of a GoogleSearch for the stored word.

        ``results_per_page`` is set to 200 on the search object first.
        """
        searcher = GoogleSearch(self.__search)
        searcher.results_per_page = 200
        results = searcher.get_results()
        return results


def main():
    """Start the module-level ``app`` by calling its ``run()`` method."""
    app.run()


# URL routing table: '/' is handled by MainPage and '/searchFile' by
# SearchFileHandle.
# NOTE(review): debug=True is presumably meant for development only --
# confirm before deploying.
app = webapp2.WSGIApplication([('/',MainPage),
                            ('/searchFile',SearchFileHandle)
                            ],
                            debug =True)

# Run the application only when this file is executed as a script.
if __name__ == "__main__":
    main()

1 个答案:

答案 0 :(得分:0)

将 bs4 模块添加到应用(app)所在的文件夹中。