在bs4中获取元素之间的文本

时间:2018-09-08 09:16:50

标签: python beautifulsoup

HTML来源:

<script type="text/javascript">window._sharedData = {"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"country_code":"NL","language_code":"en","locale":"en_US","entry_data":{"ProfilePage":[{"logging_page_id":"profilePage_4469324900","show_suggested_profiles":false,"graphql":{"user":{"biography":"","blocked_by_viewer":false,"country_block":false,"external_url":null,"external_url_linkshimmed":null,"edge_followed_by":{"count":143},"followed_by_viewer":false,"edge_follow":{"count":43},"follows_viewer":false,"full_name":"\u0627\u062c\u0627\u0631\u0647 \u0648\u06cc\u0644\u0627 \u062f\u0631 \u06af\u0631\u062f\u0646\u0647 ..................  ;</script>

<script type="text/javascript">
  (function() {
    var docElement = document.documentElement;
    var classRE = new RegExp('(^|\\s)no-js(\\s|$)');
    var className = docElement.className;
    docElement.className = className.replace(classRE, '$1js$2');
  })();
</script>

现在,我希望输出**仅**显示 window._sharedData = 之后的所有内容

输出:

{"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"count .......

这是我的代码:

# Profile page whose inline <script> data is being scraped.
url = 'https://www.instagram.com/mehran_eblaghi/'
s = requests.session()
# Despite its name, `soup` ends up holding the collection of every <script>
# tag on the page, not the parsed document itself.
# NOTE(review): `bs` is presumably an alias for bs4.BeautifulSoup; its import
# is not shown in this snippet.
soup = bs(s.get(url).text, 'html.parser').findAll('script')
# Prints all <script> elements in full rather than only the wanted payload.
print(soup)

1 个答案:

答案 0 :(得分:1)

使用bs4查找文本以目标前缀开头的第一个script标签,然后取出其文本内容并去掉该前缀,例如:

import json
import requests
import bs4

# Prefix of the inline script whose JSON payload we want to extract.
key = 'window._sharedData = '

# NOTE: `url` must already be defined, e.g. the profile URL from the question.
soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
# Select the first <script> whose text starts with the prefix. The `s and`
# guard skips tags for which no string is available (the filter gets None).
script_tag = soup.find('script', string=lambda s: s and s.startswith(key))
if script_tag:
    # Everything after the prefix is the raw payload of the script.
    text_data = script_tag.text.partition(key)[2]
    # Strip the trailing ';' (and newline) so the remainder is valid JSON.
    data = json.loads(text_data.rstrip(';\n'))
else:
    # No matching tag was found. Bind `data` explicitly so later code can
    # test for None; a branch containing only a comment is a syntax error.
    data = None

如果找到了相应的script标签,那么data将是如下所示的Python字典:

{'activity_counts': None,
 'config': {'csrf_token': '1Srrhc6GQmmC19TdM3nLFsDOORtJMpCj', 'viewer': None},
 'supports_es6': False,
 'country_code': 'GB',
 'language_code': 'en',
 'locale': 'en_US',
 'entry_data': {'ProfilePage': [{'logging_page_id': 'profilePage_4469324900',
    'show_suggested_profiles': False,
    'graphql': {'user': {'biography': '',
      'blocked_by_viewer': False,
      'country_block': False,
      'external_url': None,
      'external_url_linkshimmed': None,
      'edge_followed_by': {'count': 143},
      'followed_by_viewer': False,
      'edge_follow': {'count': 43},
      'follows_viewer': False,
      'full_name': 'اجاره ویلا در گردنه حیران',
      'has_channel': False,
      'has_blocked_viewer': False,
      'highlight_reel_count': 0,
      'has_requested_viewer': False,
      'id': '4469324900',
      'is_business_account': False,
      'is_private': False,
      'is_verified': False,
      'edge_mutual_followed_by': {'count': 0, 'edges': []},
      'profile_pic_url': 'https://scontent-lht6-1.cdninstagram.com/vp/ee763d48bb0c35ac0c6aa22dc1e2ed08/5C31C768/t51.2885-19/s150x150/15876073_1641186492851073_2628164662507601920_n.jpg',
      'profile_pic_url_hd': 'https://scontent-lht6-1.cdninstagram.com/vp/fd5c97116848cf46ddf24f8ac8d1fd7e/5C35B210/t51.2885-19/s320x320/15876073_1641186492851073_2628164662507601920_n.jpg',
      'requested_by_viewer': False,
      'username': 'mehran_eblaghi',
      'connected_fb_page': None,
      'edge_owner_to_timeline_media': {'count': 2,
       'page_info': {'has_next_page': False,
        'end_cursor': 'AQBnocogeHdSL1DSSxRdiYR4D1RguUeEj5Ap1do1KIy4U_NutZIe9ZCyRpDExD4TL9k'},
       'edges': [{'node': {'__typename': 'GraphImage',
          'id': '1429655015362664538',
          'edge_media_to_caption': {'edges': [{'node': {'text': 'درصورت نیاز به ویلاتماس بگیرید 09112815125'}}]},
          'shortcode': 'BPXJ6luDBha',
          'edge_media_to_comment': {'count': 10},
          'comments_disabled': False,
          'taken_at_timestamp': 1484648180,
          'dimensions': {'height': 1080, 'width': 1080},
          'display_url': 'https://scontent-lht6-1.cdninstagram.com/vp/abeb67556e5e2166e497cc779e99fab2/5C33A30D/t51.2885-15/e35/14597426_594812037376264_3725484886300033024_n.jpg',
          'edge_liked_by': {'count': 42},
          'edge_media_preview_like': {'count': 42},
          'gating_info': None,
          'media_preview': 'ACoqZEv32OQcHHpViMrKoJxkE59cVnFC7HB6epP+NSBGhXdkc8e/5+lNRYc0UaYiBO7J9aR3DDg4IOCaoR3IOFwcr1x0wf8AGkF0vJCHB75POP5UrNF3XyNBpARwQe1VN+OKiEqA5AIbr+Hr0q7tVud/Xn7p/wAahptj0RnKcEkVKXwMY/lTEbHNSbt3OK6rbadF1OaMrXXNZ3elr/oN345A5/xpxk56cGoy5JpBKR9KVutvxKUntzO3+H0/p9SSRz0x/wDqqrg1P5mRj161HVxVuljKcru6lf8ACw8DNH61KANo+lMIFGvRrbt/wSLq7unu+v8AwBhIHamnHYfqKdimEClZ919z/wAwutrfl/kKKKYAKkqlfr+Vv1Ynbof/2Q==',
          'owner': {'id': '4469324900'},
          'thumbnail_src': 'https://scontent-lht6-1.cdninstagram.com/vp/a50ea8ec7e91454bc0b981b9a347c2b9/5C2CDBE8/t51.2885-15/sh0.08/e35/s640x640/14597426_594812037376264_3725484886300033024_n.jpg',
          'thumbnail_resources': [{'src': 'https://scontent-lht6-1.cdninstagram.com/vp/8ecae5da8cdf4f981a29ec7a0c6b0a08/5C30AF4F/t51.2885-15/e35/s150x150/14597426_594812037376264_3725484886300033024_n.jpg',
            'config_width': 150,
            'config_height': 150},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/fe3689ac4d9165c32369e8fc460f0040/5C187505/t51.2885-15/e35/s240x240/14597426_594812037376264_3725484886300033024_n.jpg',
            'config_width': 240,
            'config_height': 240},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/be7a47d6b422add7f77d597c0eecd21e/5C31FBBF/t51.2885-15/e35/s320x320/14597426_594812037376264_3725484886300033024_n.jpg',
            'config_width': 320,
            'config_height': 320},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/2f6d7c80500d9d56f940be6ffa0e8e9a/5C1568E5/t51.2885-15/e35/s480x480/14597426_594812037376264_3725484886300033024_n.jpg',
            'config_width': 480,
            'config_height': 480},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/a50ea8ec7e91454bc0b981b9a347c2b9/5C2CDBE8/t51.2885-15/sh0.08/e35/s640x640/14597426_594812037376264_3725484886300033024_n.jpg',
            'config_width': 640,
            'config_height': 640}],
          'is_video': False,
          'accessibility_caption': None}},
        {'node': {'__typename': 'GraphImage',
          'id': '1429628539162724247',
          'edge_media_to_caption': {'edges': []},
          'shortcode': 'BPXD5T1jgeX',
          'edge_media_to_comment': {'count': 3},
          'comments_disabled': False,
          'taken_at_timestamp': 1484645024,
          'dimensions': {'height': 1080, 'width': 1080},
          'display_url': 'https://scontent-lht6-1.cdninstagram.com/vp/b48766cc9da8d14904f702a927884f5b/5C2B24EA/t51.2885-15/e35/16110374_198276563977954_7548368730246348800_n.jpg',
          'edge_liked_by': {'count': 42},
          'edge_media_preview_like': {'count': 42},
          'gating_info': None,
          'media_preview': 'ACoqdDpYeEP0J5wTSHR2C5yPzP8AhVn7YVi45xgYNWbWbzjtII4z1BFVzMjQpxacigHILZ6c/wD1qr6jaFDuwMH0rdBjDYGMjg5pJYVkXDcjNF9bkOCvzLc4dhim5rR1K3EEmByD0rNq7miNRmLIQOScVYsJGik+YEbhgfnn+VY4ncd6kSaR+nJH6etZhaxqyXa/PzyW4/Opri+/dqqn+77dqwZFZTyOvI96aZGHB4x7UrBa5PevvlJ69P5VVp5DPz1pm0+hqwL6wRZ5GB7k/wD6qsRwRxncuc/Wq6E0McEY4osTctCJGYM7EsvTt/jUhhiJ3MN59W5/+tUAozSsFywWUYCgY/pTOKhYZGaQGixLP//Z',
          'owner': {'id': '4469324900'},
          'thumbnail_src': 'https://scontent-lht6-1.cdninstagram.com/vp/d37f58bf9a6bcbe17242a7e0b233c5c0/5C331E0F/t51.2885-15/sh0.08/e35/s640x640/16110374_198276563977954_7548368730246348800_n.jpg',
          'thumbnail_resources': [{'src': 'https://scontent-lht6-1.cdninstagram.com/vp/f14bd4b53c62c2fe56ba88f1a3ab85cf/5C1DC3A8/t51.2885-15/e35/s150x150/16110374_198276563977954_7548368730246348800_n.jpg',
            'config_width': 150,
            'config_height': 150},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/616bc4d9abe790d1c9e06dbb22e7b43f/5C266AE2/t51.2885-15/e35/s240x240/16110374_198276563977954_7548368730246348800_n.jpg',
            'config_width': 240,
            'config_height': 240},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/09d6473c69ad0b4e493f05c6d3aad9a4/5C205958/t51.2885-15/e35/s320x320/16110374_198276563977954_7548368730246348800_n.jpg',
            'config_width': 320,
            'config_height': 320},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/e5d6902499831040caded69325585dfc/5C350A02/t51.2885-15/e35/s480x480/16110374_198276563977954_7548368730246348800_n.jpg',
            'config_width': 480,
            'config_height': 480},
           {'src': 'https://scontent-lht6-1.cdninstagram.com/vp/d37f58bf9a6bcbe17242a7e0b233c5c0/5C331E0F/t51.2885-15/sh0.08/e35/s640x640/16110374_198276563977954_7548368730246348800_n.jpg',
            'config_width': 640,
            'config_height': 640}],
          'is_video': False,
          'accessibility_caption': None}}]},
      'edge_saved_media': {'count': 0,
       'page_info': {'has_next_page': False, 'end_cursor': None},
       'edges': []},
      'edge_media_collections': {'count': 0,
       'page_info': {'has_next_page': False, 'end_cursor': None},
       'edges': []}}},
    'felix_onboarding_video_resources': {'mp4': '/static/videos/felix-onboarding/onboardingVideo.mp4/9d16838ca7f9.mp4',
     'poster': '/static/images/felix-onboarding/onboardingVideoPoster.png/8fdba7cf2120.png'}}]},
 'gatekeepers': {'cb': True,
  'sf': True,
  'ld': True,
  'seo': True,
  'seoht': True,
  'saa': True,
  'phone_qp': True},
 'knobs': {'acct:ntb': 0, 'cb': 0, 'captcha': 0},
 'qe': {'form_navigation_dialog': {'g': '', 'p': {}},
  'cred_man': {'g': 'test', 'p': {'use_on_landing': 'true'}},
  'iab': {'g': '', 'p': {}},
  'app_upsell_li': {'g': '', 'p': {}},
  'app_upsell': {'g': '', 'p': {}},
  'stale_fix': {'g': '', 'p': {}},
  'profile_header_name': {'g': '', 'p': {}},
  'bc3l': {'g': '', 'p': {}},
  'direct_conversation_reporting': {'g': '', 'p': {}},
  'general_reporting': {'g': '', 'p': {}},
  'reporting': {'g': '', 'p': {}},
  'acc_recovery_link': {'g': '', 'p': {}},
  'notif': {'g': '', 'p': {}},
  'fb_unlink': {'g': '', 'p': {}},
  'mobile_stories_doodling': {'g': '', 'p': {}},
  'show_copy_link': {'g': '', 'p': {}},
  'mobile_logout': {'g': '', 'p': {}},
  'p_edit': {'g': '', 'p': {}},
  '404_as_react': {'g': '', 'p': {}},
  'acc_recovery': {'g': '', 'p': {}},
  'collections': {'g': '', 'p': {}},
  'comment_ta': {'g': '', 'p': {}},
  'su': {'g': '', 'p': {}},
  'disc_ppl': {'g': '', 'p': {}},
  'ebd_ul': {'g': 'launch', 'p': {'is_enabled': 'true'}},
  'ebdsim_li': {'g': '', 'p': {}},
  'ebdsim_lo': {'g': '', 'p': {}},
  'empty_feed': {'g': '', 'p': {}},
  'bundles': {'g': '', 'p': {}},
  'exit_story_creation': {'g': '', 'p': {}},
  'appsell': {'g': '', 'p': {}},
  'imgopt': {'g': '', 'p': {}},
  'follow_button': {'g': '', 'p': {}},
  'loggedout': {'g': '', 'p': {}},
  'loggedout_upsell': {'g': 'control_without_new_loggedout_upsell_content_03_15_18',
   'p': {'has_new_loggedout_upsell_content': 'false'}},
  'msisdn': {'g': '', 'p': {}},
  'bg_sync': {'g': '', 'p': {}},
  'onetaplogin': {'g': '', 'p': {}},
  'login_poe': {'g': '', 'p': {}},
  'private_lo': {'g': '', 'p': {}},
  'profile_tabs': {'g': '', 'p': {}},
  'push_notifications': {'g': '', 'p': {}},
  'reg': {'g': '', 'p': {}},
  'reg_vp': {'g': 'test_group_1', 'p': {'hide_value_prop': 'true'}},
  'report_media': {'g': '', 'p': {}},
  'report_profile': {'g': '', 'p': {}},
  'scroll_log': {'g': '', 'p': {}},
  'sidecar_swipe': {'g': '', 'p': {}},
  'su_universe': {'g': '', 'p': {}},
  'stale': {'g': '', 'p': {}},
  'stories_lo': {'g': 'test_05_01', 'p': {'location': 'true'}},
  'stories': {'g': '', 'p': {}},
  'tp_pblshr': {'g': '', 'p': {}},
  'video': {'g': '', 'p': {}},
  'gdpr_eu_tos': {'g': 'control_05_01',
   'p': {'gdpr_required': 'true',
    'eu_new_user_flow': 'age_two_button',
    'tos_version': 'eu'}},
  'gdpr_row_tos': {'g': '', 'p': {}},
  'fd_gr': {'g': '', 'p': {}},
  'felix': {'g': '', 'p': {}},
  'felix_clear_fb_cookie': {'g': '', 'p': {}},
  'felix_creation_duration_limits': {'g': '', 'p': {}},
  'felix_creation_enabled': {'g': '', 'p': {}},
  'felix_creation_fb_crossposting': {'g': '', 'p': {}},
  'felix_creation_fb_crossposting_v2': {'g': '', 'p': {}},
  'felix_creation_validation': {'g': '', 'p': {}},
  'felix_creation_video_upload': {'g': '', 'p': {}},
  'felix_early_onboarding': {'g': '', 'p': {}},
  'unfollow_confirm': {'g': '', 'p': {}},
  'profile_enhance_li': {'g': '', 'p': {}},
  'profile_enhance_lo': {'g': '', 'p': {}},
  'phone_confirm': {'g': '', 'p': {}},
  'comment_enhance': {'g': '', 'p': {}},
  'mweb_topical_explore': {'g': '', 'p': {}},
  'web_nametag': {'g': '', 'p': {}},
  'image_downgrade': {'g': '', 'p': {}},
  'image_downgrade_lite': {'g': '', 'p': {}},
  'follow_all_fb': {'g': '', 'p': {}},
  'lite_direct_upsell': {'g': '', 'p': {}},
  'web_loggedout_noop': {'g': '', 'p': {}},
  'stories_video_preload': {'g': '', 'p': {}},
  'lite_stories_video_preload': {'g': '', 'p': {}},
  'a2hs_heuristic_uc': {'g': '', 'p': {}},
  'a2hs_heuristic_non_uc': {'g': '', 'p': {}},
  'web_hashtag': {'g': '', 'p': {}},
  'header_scroll': {'g': '', 'p': {}},
  'rout': {'g': '', 'p': {}},
  'websr': {'g': '', 'p': {}},
  'web_lo_follow': {'g': '', 'p': {}},
  'web_share': {'g': '', 'p': {}},
  'lite_rating': {'g': '', 'p': {}},
  'web_embeds_share': {'g': '', 'p': {}},
  'web_share_lo': {'g': '', 'p': {}},
  'web_embeds_logged_out': {'g': 'test_comment_input',
   'p': {'show_comment_input': 'true'}},
  'sl': {'g': '', 'p': {}},
  'reg_nux': {'g': '', 'p': {}},
  'web_datasaver_mode': {'g': '', 'p': {}},
  'lite_datasaver_mode': {'g': '', 'p': {}},
  'lite_video_upload': {'g': '', 'p': {}}},
 'hostname': 'www.instagram.com',
 'platform': 'web',
 'rhx_gis': 'b9d7a25d3e0772990918069a0652bc21',
 'nonce': 'E+077618aJD12ZjcMWUynA==',
 'zero_data': {},
 'rollout_hash': '2502ae2429f4',
 'bundle_variant': 'base',
 'probably_has_app': False}