如何防止爬虫抓取带查询参数的 PHP 页面(.php?...)?

时间:2018-08-29 04:41:03

标签: bash shell web-crawler

我在网上找到了下面这个脚本,它非常适合用来爬取网站并生成页面列表。但在某些网站上它会失控,把以 `.php?rp=/knowledgebase&u=94890` 结尾的页面及其类似变体也收录进来,结果收录了数百个无用页面以及同一页面的多个变体。在 fetchSiteUrls() 函数中有一个 grep 排除列表,用来排除某些 URL;我尝试把这类 URL 加入该列表,但没有生效。有什么建议吗?

#!/bin/bash    

#######################################
# Show a rotating "Please wait..." indicator until a background process exits.
# Arguments:
#   $1 - (optional) PID to watch. Defaults to $!, the most recently started
#        background job, so `some_job & displaySpinner` keeps working.
# Outputs:
#   Writes spinner frames and backspaces to stdout; the spinner area is
#   blanked before returning.
#######################################
displaySpinner()
{
  local pid=${1:-$!}
  local delay=0.3
  local spinstr='|/-\'
  local temp

  # One frame ("#    Please wait... [c]  ") is 25 columns wide, so 25
  # backspaces move the cursor back to the start of the frame.
  local blank erase
  printf -v blank '%25s' ''
  erase=${blank// /$'\b'}

  # kill -0 delivers no signal; it only reports whether the PID can still be
  # signalled. Unlike grepping `ps` output it matches the exact PID, not any
  # PID that merely contains the same digits.
  while kill -0 "$pid" 2>/dev/null; do
    temp=${spinstr#?}
    printf "#    Please wait... [%c]  " "$spinstr"
    # Rotate the spinner string left by one character for the next frame.
    spinstr=$temp${spinstr%"$temp"}
    sleep "$delay"
    printf '%s' "$erase"
  done

  # Overwrite the last frame with spaces and return the cursor to its start.
  printf '%s%s' "$blank" "$erase"
} # // displaySpinner()

# Directory the crawl runs in and where the resulting URL list is written.
savelocation="${HOME}/Desktop"

#######################################
# Crawl $DOMAIN with wget in spider mode and save the unique page URLs.
# Globals:
#   DOMAIN       (read) - URL handed to wget as the crawl start point
#   savelocation (read) - directory to cd into and to write the list to
# Arguments:
#   $1 - output file name without extension; the list is written to
#        ${savelocation}/$1.txt
# Outputs:
#   Sorted, de-duplicated URLs, one per line, in the output file.
# Returns:
#   1 if savelocation cannot be entered; otherwise the status of the
#   final `sort`.
#######################################
fetchSiteUrls() {
  cd "$savelocation" || return 1

  # wget logs each request on stderr as "--<date> <time>--  <url>", so the
  # URL is the third whitespace-separated field of lines starting with "--".
  #
  # Exclusions (a URL matching ANY pattern is dropped):
  #   - static assets, optionally followed by a query string
  #   - ?p= / ?replytocom= permalink and comment-reply variants
  #   - any .php URL carrying a query string (e.g.
  #     index.php?rp=/knowledgebase&u=94890), which otherwise yields
  #     hundreds of variants of the same page
  #   - WordPress uploads, feeds, category/tag/pagination archives,
  #     widgets.php, the JSON API and xmlrpc
  # "/" needs no escaping in an ERE; newer GNU grep warns about "\/".
  wget --spider -r -nd --max-redirect=30 "$DOMAIN" 2>&1 \
    | grep '^--' \
    | awk '{ print $3 }' \
    | grep -E -v \
        -e '\.(css|js|map|xml|png|gif|jpg|JPG|bmp|txt|pdf)(\?.*)?$' \
        -e '\?(p|replytocom)=' \
        -e '\.php\?' \
        -e '/wp-content/uploads/' \
        -e '/feed/' \
        -e '/category/' \
        -e '/tag/' \
        -e '/page/' \
        -e '/widgets\.php$' \
        -e '/wp-json/' \
        -e '/xmlrpc' \
    | sort -u \
    > "${savelocation}/$1.txt"
} # // fetchSiteUrls()

# Prompt user for domain
echo "#    "
echo "#    Fetch a list of unique URLs for a domain."
echo "#    "
echo "#    Enter the full URL ( http://example.com )"

# DOMAIN is read as a global by fetchSiteUrls; -r keeps backslashes literal.
read -r -e -p "#    URL: " DOMAIN
if [[ -z "$DOMAIN" ]]; then
  echo "#    No URL entered, nothing to fetch." >&2
  exit 1
fi

# Drop the scheme and an optional leading "www." for display. The result
# stays empty when the input does not start with http:// or https://.
url_re='^https?://(www\.)?(.*)$'
displaydomain=
if [[ "$DOMAIN" =~ $url_re ]]; then
  displaydomain=${BASH_REMATCH[2]}
fi
# Suggested file name: the display domain with every dot turned into a dash.
filename=${displaydomain//./-}

echo "#    "
read -r -e -p "#    Save txt file as: " -i "${filename}" SAVEFILENAME
# Fall back to the suggestion if the user clears the pre-filled name.
savefilename=${SAVEFILENAME:-$filename}

echo "#    "
echo "#    Fetching URLs for ${displaydomain} "

# Start process in the background; displaySpinner watches $!, i.e. this job.
fetchSiteUrls "$savefilename" &
fetch_pid=$!
displaySpinner

# Collect the job's exit status so a failed fetch is not reported as success.
if ! wait "$fetch_pid"; then
  echo "#    Fetch failed." >&2
  exit 1
fi

# Process is complete, output message
echo "#    Finished!"
echo "#    "
echo "#    File Location: ${savelocation}/$savefilename.txt"
echo "#    "

0 个答案:

没有答案