使用Python抓取网站数据时的性能问题

时间:2015-05-22 03:15:51

标签: python performance web-scraping lxml python-requests

我正在尝试使用Python从一个大约有4000个页面的网站中抓取数据,每个页面包含25个链接。

我的问题是,在处理了大约200个页面之后,性能变得非常糟糕,以至于我计算机上的其他程序也会卡死。

我猜原因是我没有正确地使用内存,或者类似的问题。如果有人能帮我解决这个问题,让我的脚本运行得更顺畅、对系统资源的占用更少,我将非常感激。

提前感谢您的帮助。 :)

修改:我找到了解决方案,向下滚动即可在我给出的答案中看到。感谢所有试图帮助我的人,特别是etna和Walter A,他们给了我很好的建议,让我走上了正轨。 :)

from pprint import pprint
from lxml import etree
import itertools
import requests

def parsePageUrls(page):
    """Return the href values of all links inside ``span.tip`` elements.

    ``page`` is a parsed document exposing an ``xpath`` method (the script
    passes the result of ``etree.HTML``). The XPath result is returned
    unchanged.
    """
    return page.xpath('//span[@class="tip"]/a/@href')

def isLastPage(page):
    """Return True when ``page`` contains no ``<a rel="next">`` link.

    ``page`` is a parsed document exposing an ``xpath`` method. An explicit
    boolean is returned in both cases (previously the non-last case fell
    through and returned ``None``).
    """
    return not page.xpath('//a[@rel="next"]')

# Walk the paginated index starting at page 1, collecting the links found on
# each page, and stop at the first page that has no rel="next" anchor.
urls = []
for i in itertools.count(1):
    content = requests.get('http://www.example.com/index.php?page=' + str(i), allow_redirects=False)
    page = etree.HTML(content.text)

    urls.extend(parsePageUrls(page))

    if isLastPage(page):
        break

# pprint is a function, so it must be called with parentheses.
pprint(urls)

1 个答案:

答案 0 :(得分:0)

我终于找到了解决方案。问题在于,我以为tree.xpath返回的是普通字符串列表,但实际上它返回的是_ElementUnicodeResult对象的列表;这些对象持有对其父节点的引用,从而阻止GC回收内存。

所以解决方案是将这些_ElementUnicodeResult-Objects转换为普通字符串以去除引用。

以下是帮助我理解问题的来源:http://lxml.de/api/lxml.etree._ElementTree-class.html#xpath

对于提供的代码,以下修复了它:

而不是:

Startup.Configure()

必须是:

public class SomethingWatchFaceService extends CanvasWatchFaceService {
private static final String TAG = "SomethingWatchFaceService";



@Override
public Engine onCreateEngine() {
    /* provide your watch face implementation */
    return new Engine();
}



/* implement service callback methods */
private class Engine extends CanvasWatchFaceService.Engine {

    Bitmap mBackgroundBitmap;
    Bitmap mBackgroundScaledBitmap;

    String[] backgroundColor = {"red", "green", "blue"};

    int let= new Random().nextInt(backgroundColor.length);
    String randomColor = (backgroundColor[let]);

    Integer[] listDrawable = {R.drawable.back1, R.drawable.back2};

    //Generating the list number for drawable
    Random randNumForDrawable = new Random();
    int  n = randNumForDrawable.nextInt(2);


    //Member variables
    private Typeface WATCH_TEXT_TYPEFACE = Typeface.create( Typeface.SERIF, Typeface.NORMAL );

    private static final int MSG_UPDATE_TIME_ID = 42;
    private long mUpdateRateMs = 1000;

    private Time mDisplayTime;

    private Paint mBackgroundColorPaint;
    private Paint mTextColorPaint;

    private boolean mHasTimeZoneReceiverBeenRegistered = false;
    private boolean mIsInMuteMode;
    private boolean mIsLowBitAmbient;

    private float mXOffset;
    private float mYOffset;

    private int mBackgroundColor = Color.parseColor(randomColor);
    private int mTextColor = Color.parseColor( "white" );

    final BroadcastReceiver mTimeZoneBroadcastReceiver = new BroadcastReceiver() {
        @Override
        public void onReceive(Context context, Intent intent) {
            mDisplayTime.clear( intent.getStringExtra( "time-zone" ) );
            mDisplayTime.setToNow();
        }
    };

    private final Handler mTimeHandler = new Handler() {
        @Override
        public void handleMessage(Message msg) {
            switch( msg.what ) {
                case MSG_UPDATE_TIME_ID: {
                    invalidate();
                    if( isVisible() && !isInAmbientMode() ) {
                        long currentTimeMillis = System.currentTimeMillis();
                        long delay = mUpdateRateMs - ( currentTimeMillis % mUpdateRateMs );
                        mTimeHandler.sendEmptyMessageDelayed( MSG_UPDATE_TIME_ID, delay );
                    }
                    break;
                }
            }
        }
    };











    @Override
    public void onCreate(SurfaceHolder holder) {
        super.onCreate(holder);
         /* load the background image */
        Resources resources = SomethingWatchFaceService.this.getResources();
        Drawable backgroundDrawable = resources.getDrawable(listDrawable[n]);
        mBackgroundBitmap = ((BitmapDrawable) backgroundDrawable).getBitmap();

        setWatchFaceStyle( new WatchFaceStyle.Builder( SomethingQuotesWatchFaceService.this )
                        .setBackgroundVisibility( WatchFaceStyle.BACKGROUND_VISIBILITY_INTERRUPTIVE )
                        .setCardPeekMode( WatchFaceStyle.PEEK_MODE_SHORT)
                        .setShowSystemUiTime( false )
                        .build()
        );

        initBackground();
        initDisplayText();

        mDisplayTime = new Time();

    }

    @Override
    public void onPropertiesChanged(Bundle properties) {
        super.onPropertiesChanged(properties);
        /* get device features (burn-in, low-bit ambient) */
        if( properties.getBoolean( PROPERTY_BURN_IN_PROTECTION, false ) ) {
            mIsLowBitAmbient = properties.getBoolean( PROPERTY_LOW_BIT_AMBIENT, false );
        }
    }

    @Override
    public void onTimeTick() {
        super.onTimeTick();
        /* the time changed */
        invalidate();
    }

    @Override
    public void onAmbientModeChanged(boolean inAmbientMode) {
        super.onAmbientModeChanged(inAmbientMode);
        // when Ambient Mode changes, we changes the color of the background paint.
        if( inAmbientMode ) {
            mTextColorPaint.setColor( Color.parseColor( "grey" ) );
            mBackgroundColorPaint.setColor( Color.parseColor( "black" ) );
        } else {
            mTextColorPaint.setColor( Color.parseColor( "white" ) );
            mBackgroundColorPaint.setColor( Color.parseColor( randomColor ) );
        }

        if( mIsLowBitAmbient ) {
            mTextColorPaint.setAntiAlias( !inAmbientMode );
        }

        invalidate();
        updateTimer();

    }


    @Override
    public void onDraw(Canvas canvas, Rect bounds) {

        drawBackground( canvas, bounds );

        int width = bounds.width();
        int height = bounds.height();

        // Draw the background, scaled to fit.

        if (mBackgroundScaledBitmap == null
                || mBackgroundScaledBitmap.getWidth() != width
                || mBackgroundScaledBitmap.getHeight() != height) {
            mBackgroundScaledBitmap = Bitmap.createScaledBitmap(mBackgroundBitmap,
                    width, height, true /* filter */);
        }
        canvas.drawBitmap(mBackgroundScaledBitmap, 0, 0, null);

        mDisplayTime.setToNow();


        drawTimeText( canvas );






    }
    private void initBackground() {
        mBackgroundColorPaint = new Paint();
        mBackgroundColorPaint.setColor( mBackgroundColor );
    }
    private void drawBackground( Canvas canvas, Rect bounds ) {
        canvas.drawRect( 0, 0, bounds.width(), bounds.height(), mBackgroundColorPaint );
    }



    private void initDisplayText() {
        mTextColorPaint = new Paint();
        mTextColorPaint.setColor( mTextColor );
        mTextColorPaint.setTypeface( WATCH_TEXT_TYPEFACE );
        mTextColorPaint.setAntiAlias( true );
        mTextColorPaint.setTextSize( getResources().getDimension( R.dimen.text_size ) );
    }

    private void updateTimer() {
        mTimeHandler.removeMessages( MSG_UPDATE_TIME_ID );
        if( isVisible() && !isInAmbientMode() ) {
            mTimeHandler.sendEmptyMessage( MSG_UPDATE_TIME_ID );
        }
    }



    private void drawTimeText( Canvas canvas ) {
        String timeText = getHourString() + ":" + String.format( "%02d", mDisplayTime.minute );
        if( isInAmbientMode() || mIsInMuteMode ) {
            timeText += ( mDisplayTime.hour < 12 ) ? "AM" : "PM";
        } else {
            timeText += String.format( ":%02d", mDisplayTime.second);
        }
        canvas.drawText( timeText, mXOffset, mYOffset, mTextColorPaint );
    }

    private String getHourString() {
        if( mDisplayTime.hour % 12 == 0 )
            return "12";
        else if( mDisplayTime.hour <= 12 )
            return String.valueOf( mDisplayTime.hour );
        else
            return String.valueOf( mDisplayTime.hour - 12 );
    }




    @Override
    public void onVisibilityChanged(boolean visible) {
        super.onVisibilityChanged(visible);
        /* the watch face became visible or invisible */
        if( visible ) {
            if( !mHasTimeZoneReceiverBeenRegistered ) {

                IntentFilter filter = new IntentFilter( Intent.ACTION_TIMEZONE_CHANGED );
                InspirationalQuotesWatchFaceService.this.registerReceiver( mTimeZoneBroadcastReceiver, filter );

                mHasTimeZoneReceiverBeenRegistered = true;
            }

            mDisplayTime.clear( TimeZone.getDefault().getID() );
            mDisplayTime.setToNow();
        } else {
            if( mHasTimeZoneReceiverBeenRegistered ) {
                SomethingQuotesWatchFaceService.this.unregisterReceiver( mTimeZoneBroadcastReceiver );
                mHasTimeZoneReceiverBeenRegistered = false;
            }
        }

        updateTimer();
    }

    @Override
    public void onApplyWindowInsets(WindowInsets insets) {
        super.onApplyWindowInsets(insets);

        mYOffset = getResources().getDimension( R.dimen.y_offset );

        if( insets.isRound() ) {
            mXOffset = getResources().getDimension( R.dimen.x_offset_round );
        } else {
            mXOffset = getResources().getDimension( R.dimen.x_offset_square );
        }
    }
}
相关问题